From 99dbcc08e2fa2ea3cd96dc344d8e882e97433242 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Tue, 24 Feb 2026 05:20:21 +0000
Subject: [PATCH] fix: replace regex code block parser with stateful
 stack-based parser (#983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the single-regex fenced code block parser with a line-by-line
stateful parser that uses a stack to track fence depth. This fixes the
asymmetry where ```language was recognized as an opening fence but a
closing fence with text directly after the backticks (```Some text)
was not recognized as a closing fence.

Key changes:
- Line classification: FENCE_INFO (```lang, a single word → always
  opening), FENCE_PURE (``` alone → close at depth>0, open at depth 0),
  FENCE_DIRTY (```several words → dirty close at depth>0, plain text
  at depth 0)
- Stack-based depth tracking with fence char/length matching
- Level-0 only emission: nested fences become content of outer block
- Two-phase architecture: Phase 1 = fence parsing, Phase 2 = HTML/SVG
  extraction from markdown segments
- Dirty close handling: trailing text after closing backticks becomes
  the start of the next markdown block

Co-authored-by: Enrico Ros <enricoros@users.noreply.github.com>
---
 src/modules/blocks/blocks.textparser.ts | 177 ++++++++++++++++++++----
 1 file changed, 153 insertions(+), 24 deletions(-)

diff --git a/src/modules/blocks/blocks.textparser.ts b/src/modules/blocks/blocks.textparser.ts
index 5b3475a25..a4ef1962a 100644
--- a/src/modules/blocks/blocks.textparser.ts
+++ b/src/modules/blocks/blocks.textparser.ts
@@ -15,25 +15,165 @@ export function parseBlocksFromText(text: string): RenderBlockInputs {
   if (imageBkInputs)
     return imageBkInputs;
 
+  // Phase 1: Parse fenced code blocks with stack-based state tracking
+  const phase1Blocks = _parseFencedBlocks(text);
+
+  // Phase 2: Extract HTML/SVG blocks from markdown segments
+  const blocks: RenderBlockInputs = [];
+  for (const block of phase1Blocks) {
+    if (block.bkt === 'md-bk')
+      _extractHtmlSvgBlocks(block.content, blocks);
+    else
+      blocks.push(block);
+  }
+
+  return blocks;
+}
+
+
+// Fence line detection: matches lines starting with 3+ backticks or tildes
+const _fenceLineRegex = /^(`{3,}|~{3,})(.*)/;
+
+/**
+ * Phase 1: Line-by-line stateful parser with stack-based fence depth tracking.
+ *
+ * Classification of fence lines:
+ * - FENCE_INFO (```python):  single-word after fence → always an opening signal
+ * - FENCE_PURE (```):        only whitespace after fence → close at depth > 0, open at depth 0
+ * - FENCE_DIRTY (```Text..): multi-word after fence → dirty close at depth > 0, plain text at depth 0
+ *
+ * Stack rules:
+ * - Openings push {char, len} onto the stack
+ * - Closes pop if fence char and length (≥ opening) match the top of stack
+ * - Only level-0 blocks are emitted; nested fences become content of the outer block
+ */
+function _parseFencedBlocks(text: string): RenderBlockInputs {
+  const lines = text.split('\n');
+  const blocks: RenderBlockInputs = [];
+
+  // Stack of open fence blocks: each entry tracks the fence character and length
+  const stack: { fenceChar: string; fenceLen: number }[] = [];
+
+  // Accumulation buffers
+  let mdLines: string[] = [];   // markdown text lines (depth 0)
+  let codeLines: string[] = []; // code content lines (depth > 0)
+  let codeTitle = '';            // info string of the outermost code block
+
+  // Flush accumulated markdown lines as a md-bk block
+  const flushMarkdown = () => {
+    if (mdLines.length > 0) {
+      const content = mdLines.join('\n');
+      if (content)
+        blocks.push({ bkt: 'md-bk', content });
+      mdLines = [];
+    }
+  };
+
+  // Emit a code-bk block from accumulated code lines
+  const emitCodeBlock = (isPartial: boolean) => {
+    // Trim trailing whitespace-only characters from the last line, preserving leading spaces
+    let code = codeLines.join('\n');
+    code = code.replace(/[\t ]+$/, '');
+    blocks.push({
+      bkt: 'code-bk',
+      title: codeTitle,
+      code,
+      lines: countLines(code),
+      isPartial,
+    });
+    codeLines = [];
+    codeTitle = '';
+  };
+
+  for (const line of lines) {
+    const depth = stack.length;
+    const fenceMatch = line.match(_fenceLineRegex);
+
+    if (depth === 0) {
+      // --- At top level: looking for code block openings ---
+      if (fenceMatch) {
+        const fenceStr = fenceMatch[1];
+        const afterFence = fenceMatch[2].trim();
+        // Single non-whitespace token = info string (language tag); empty = pure fence
+        const isSingleToken = afterFence !== '' && !/\s/.test(afterFence);
+
+        if (afterFence === '' || isSingleToken) {
+          // Opening fence: flush any preceding markdown and start a code block
+          flushMarkdown();
+          stack.push({ fenceChar: fenceStr[0], fenceLen: fenceStr.length });
+          codeTitle = afterFence;
+          codeLines = [];
+        } else {
+          // Multi-word text after backticks at depth 0 → not a fence, just markdown
+          mdLines.push(line);
+        }
+      } else {
+        mdLines.push(line);
+      }
+    } else {
+      // --- Inside a code block: looking for closes or nested opens ---
+      if (fenceMatch) {
+        const fenceStr = fenceMatch[1];
+        const fenceChar = fenceStr[0];
+        const fenceLen = fenceStr.length;
+        const afterFence = fenceMatch[2].trim();
+        const isSingleToken = afterFence !== '' && !/\s/.test(afterFence);
+        const top = stack[stack.length - 1];
+
+        if (isSingleToken) {
+          // Single-word info string → nested opening (e.g., ```json inside ```markdown)
+          // Track depth so the outer block absorbs this content
+          stack.push({ fenceChar, fenceLen });
+          codeLines.push(line);
+        } else if (fenceChar === top.fenceChar && fenceLen >= top.fenceLen) {
+          // Fence matches the top of stack → close attempt (pure or dirty)
+          stack.pop();
+          if (stack.length === 0) {
+            // Back to depth 0: emit the code block
+            emitCodeBlock(false);
+            // For dirty closes, trailing text becomes the start of the next markdown block
+            if (afterFence)
+              mdLines.push(afterFence);
+          } else {
+            // Still nested: this closing fence is content of the outer block
+            codeLines.push(line);
+          }
+        } else {
+          // Fence doesn't match top of stack (wrong char or too short) → just content
+          codeLines.push(line);
+        }
+      } else {
+        // Regular text line inside code block
+        codeLines.push(line);
+      }
+    }
+  }
+
+  // End of text: flush remaining state
+  if (stack.length > 0)
+    emitCodeBlock(true);
+  flushMarkdown();
+
+  return blocks;
+}
+
+
+/**
+ * Phase 2: Extract HTML document and SVG blocks from markdown text segments.
+ * Splits a markdown text into sub-blocks, extracting `<!DOCTYPE html>...</html>`
+ * and `<svg ...>...</svg>` as code-bk blocks.
+ */
+function _extractHtmlSvgBlocks(text: string, blocks: RenderBlockInputs): void {
   const regexPatterns = {
-    // was: \w\x20\\.+-_ for tge filename, but was missing too much
-    // REVERTED THIS: was: (`{3,}\n?|$), but was matching backticks within blocks. so now it must end with a newline or stop
-    // This was the longest in use, and still we're based on it
-    // codeBlock: /`{3,}([\S\x20]+)?\n([\s\S]*?)(`{3,}\n?|$)/g,
-    // This is way more promising, but will either not perform a partial match (no match at all) or match a single line
-    // codeBlock: /^( {0,3})`{3,}([^\n`]*)\n([\s\S]*?)(?:\n^\1`{3,}[^\S\n]*(?=\n|$))?/gm,
-    // codeBlock: /`{3,}([^\n`]*)\n([\s\S]*?)(`{3,}(?=[ ]*(?:\n|$))|$)/g, // #983
-    codeBlock: /`{3,}([^\n`]*)\n([\s\S]*?)(`{3,}(?=[ *\n]|$)|$)/g,
     htmlCodeBlock: /<!DOCTYPE html>([\s\S]*?)<\/html>/gi,
     svgBlock: /<svg (xmlns|width|viewBox)=([\s\S]*?)<\/svg>/g,
   };
 
-  const blocks: RenderBlockInputs = [];
   let lastIndex = 0;
 
   while (true) {
 
-    // find the first match (if any) trying all the regexes
+    // Find the earliest match across all patterns
     let match: RegExpExecArray | null = null;
     let matchType: keyof typeof regexPatterns | null = null;
     let earliestMatchIndex: number | null = null;
@@ -51,20 +191,12 @@ export function parseBlocksFromText(text: string): RenderBlockInputs {
     if (match === null)
       break;
 
-    // anything leftover before the match is text
+    // Text before the match
     if (match.index > lastIndex)
       blocks.push({ bkt: 'md-bk', content: text.slice(lastIndex, match.index) });
 
-    // add the block
+    // Emit the matched block
     switch (matchType) {
-      case 'codeBlock':
-        const blockTitle: string = (match[1] || '').trim();
-        // note: we don't trim blockCode to preserve leading spaces, however if the last line is only made of spaces or tabs, we trim that
-        const blockCode: string = match[2].replace(/[\t ]+$/, '');
-        const blockEnd: string = match[3];
-        blocks.push({ bkt: 'code-bk', title: blockTitle, code: blockCode, lines: countLines(blockCode), isPartial: !blockEnd.startsWith('```') });
-        break;
-
       case 'htmlCodeBlock':
         const preMatchHtml: string = `<!DOCTYPE html>${match[1]}</html>`;
         blocks.push({ bkt: 'code-bk', title: 'html', code: preMatchHtml, lines: countLines(preMatchHtml), isPartial: false });
@@ -75,13 +207,10 @@ export function parseBlocksFromText(text: string): RenderBlockInputs {
         break;
     }
 
-    // advance the pointer
     lastIndex = match.index + match[0].length;
   }
 
-  // remainder is text
+  // Remainder (or entire text if no matches) is markdown text
   if (lastIndex < text.length)
     blocks.push({ bkt: 'md-bk', content: text.slice(lastIndex) });
-
-  return blocks;
 }