fix(editor): avoid native paste when html drops raw tags

fix(editor): prefer rich html paste when semantic
fix(editor): preserve raw html-like text on paste
2026-06-28 01:49:18 +02:00 · 2026-05-27 16:59:02 +08:00 · 2026-05-27 16:52:54 +08:00 · 2026-05-27 16:37:44 +08:00 · 2026-05-27 16:03:53 +08:00 · 2026-05-27 15:28:10 +08:00
2 changed files with 518 additions and 7 deletions
--- a/packages/views/editor/extensions/markdown-paste.test.ts
+++ b/packages/views/editor/extensions/markdown-paste.test.ts
@@ -2,7 +2,10 @@ import { describe, it, expect, afterEach, vi } from "vitest";
 import { Editor } from "@tiptap/core";
 import StarterKit from "@tiptap/starter-kit";
 import { Markdown } from "@tiptap/markdown";
-import { createMarkdownPasteExtension } from "./markdown-paste";
+import {
+  createMarkdownPasteExtension,
+  escapeRawHtmlTagsOutsideCode,
+} from "./markdown-paste";

 interface FakeClipboard {
  files: never[];
@@ -141,6 +144,93 @@ describe("markdownPaste — code block context", () => {
    expect(types).toContain("heading");
  });

+  it("lets semantic rich HTML paste natively instead of reparsing plain text as Markdown", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+    const parseSpy = vi.spyOn(editor.markdown!, "parse");
+
+    const text =
+      "viewFiltersToApiParams(filters) maps to Partial<ListIssuesParams>.";
+    const html =
+      "<p><code>viewFiltersToApiParams(filters)</code> maps to " +
+      "<code>Partial&lt;ListIssuesParams&gt;</code>.</p>";
+
+    const handled = paste(editor, text, html);
+    expect(handled).toBe(false);
+    expect(parseSpy).not.toHaveBeenCalled();
+  });
+
+  it("lets list and emphasis HTML paste natively", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+    const parseSpy = vi.spyOn(editor.markdown!, "parse");
+
+    const handled = paste(
+      editor,
+      "Done\nCreated filters.ts",
+      "<ul><li><strong>Done</strong></li><li>Created filters.ts</li></ul>",
+    );
+
+    expect(handled).toBe(false);
+    expect(parseSpy).not.toHaveBeenCalled();
+  });
+
+  it("does not paste rich HTML natively when its text would drop raw tag-like lines", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const text =
+      "<t>\n\n裸 `<tag>` 做转\n\n<tag>\n\n" +
+      "<t>\n\n裸 `<tag>` 做转\n\n<tag>";
+    const html =
+      "<div><t></t></div>" +
+      "<p>裸 <code>&lt;tag&gt;</code> 做转</p>" +
+      "<div><tag></tag></div>" +
+      "<div><t></t></div>" +
+      "<p>裸 <code>&lt;tag&gt;</code> 做转</p>" +
+      "<div><tag></tag></div>";
+
+    const handled = paste(editor, text, html);
+    expect(handled).toBe(true);
+
+    const editorText = editor.getText();
+    expect(editorText.match(/<t>/g)).toHaveLength(2);
+    expect(editorText.match(/<tag>/g)).toHaveLength(4);
+    expect(editorText.match(/裸/g)).toHaveLength(2);
+  });
+
+  it("still parses Markdown when HTML is only a syntax-highlight wrapper", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const handled = paste(
+      editor,
+      "# Heading\n\nbody",
+      '<pre><code><span style="color: #888"># Heading</span><br><br>body</code></pre>',
+    );
+
+    expect(handled).toBe(true);
+    const json = editor.getJSON() as JsonNode;
+    const types = (json.content ?? []).map((n) => n.type);
+    expect(types).toContain("heading");
+  });
+
  it("inserts JSON clipboard text without running the Markdown parser", () => {
    editor = makeEditor({
      type: "doc",
@@ -178,6 +268,83 @@ describe("markdownPaste — code block context", () => {
    expectLiteralPaste(editor, text);
  });

+  it("preserves single unknown HTML-like tag (e.g. <T>)", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const handled = paste(editor, "<T>");
+    expect(handled).toBe(true);
+    expect(editor.getText()).toBe("<T>");
+  });
+
+  it("preserves any unknown HTML-like tag (e.g. <MyComponent>)", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const handled = paste(editor, "<MyComponent>");
+    expect(handled).toBe(true);
+    expect(editor.getText()).toBe("<MyComponent>");
+  });
+
+  it("preserves unknown tags in mixed multi-line content", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const text = "<t>\n\n裸 `<tag>` 做转\n\n<tag>\n\n<t>";
+    const handled = paste(editor, text);
+    expect(handled).toBe(true);
+
+    const editorText = editor.getText();
+    expect(editorText).toContain("<t>");
+    expect(editorText).toContain("<tag>");
+    expect(editorText).toContain("裸");
+    expect(editorText).toContain("做转");
+  });
+
+  it("preserves HTML-like tags embedded in regular text", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const text = "foo <bar> baz";
+    const handled = paste(editor, text);
+    expect(handled).toBe(true);
+    expect(editor.getText()).toBe(text);
+    expect(editor.getMarkdown()).toBe("foo &lt;bar&gt; baz");
+  });
+
+  it("preserves standard HTML element names as literal pasted text", () => {
+    editor = makeEditor({
+      type: "doc",
+      content: [{ type: "paragraph" }],
+    });
+
+    editor.commands.setTextSelection(1);
+
+    const text = 'foo <button> <img src="x"> baz';
+    const handled = paste(editor, text);
+    expect(handled).toBe(true);
+    expect(editor.getText()).toBe(text);
+    expect(editor.getMarkdown()).toBe(
+      'foo &lt;button&gt; &lt;img src="x"&gt; baz',
+    );
+  });
+
  it("does not parse oversized bracketed plain text as JSON", () => {
    editor = makeEditor({
      type: "doc",
@@ -192,3 +359,62 @@ describe("markdownPaste — code block context", () => {
    expect(parseJsonSpy).not.toHaveBeenCalled();
  });
 });
+
+// Unit tests for escapeRawHtmlTagsOutsideCode: tag-like runs (unknown and
+// standard element names alike) are rewritten to &lt;…&gt; entities, while
+// inline code spans, fenced code blocks, and bare comparison operators are
+// returned unchanged.
+describe("escapeRawHtmlTagsOutsideCode", () => {
+  it("escapes HTML-like tags", () => {
+    expect(escapeRawHtmlTagsOutsideCode("<T>")).toBe("&lt;T&gt;");
+    expect(escapeRawHtmlTagsOutsideCode("<tag>")).toBe("&lt;tag&gt;");
+    expect(escapeRawHtmlTagsOutsideCode("<MyComponent>")).toBe(
+      "&lt;MyComponent&gt;",
+    );
+    expect(escapeRawHtmlTagsOutsideCode("</tag>")).toBe("&lt;/tag&gt;");
+  });
+
+  it("escapes standard HTML element names too", () => {
+    expect(escapeRawHtmlTagsOutsideCode("<div>")).toBe("&lt;div&gt;");
+    expect(escapeRawHtmlTagsOutsideCode("<br>")).toBe("&lt;br&gt;");
+    expect(escapeRawHtmlTagsOutsideCode("</div>")).toBe("&lt;/div&gt;");
+    expect(escapeRawHtmlTagsOutsideCode('<img src="x">')).toBe(
+      '&lt;img src="x"&gt;',
+    );
+  });
+
+  it("does not escape inside inline code spans", () => {
+    expect(escapeRawHtmlTagsOutsideCode("`<tag>`")).toBe("`<tag>`");
+    expect(escapeRawHtmlTagsOutsideCode("text `<T>` more")).toBe(
+      "text `<T>` more",
+    );
+    expect(escapeRawHtmlTagsOutsideCode("``<tag>``")).toBe("``<tag>``");
+  });
+
+  it("does not escape inside fenced code blocks", () => {
+    expect(escapeRawHtmlTagsOutsideCode("```\n<T>\n```")).toBe(
+      "```\n<T>\n```",
+    );
+    expect(escapeRawHtmlTagsOutsideCode("~~~\n<tag>\n~~~")).toBe(
+      "~~~\n<tag>\n~~~",
+    );
+    expect(escapeRawHtmlTagsOutsideCode("   ```\n<T>\n   ```")).toBe(
+      "   ```\n<T>\n   ```",
+    );
+  });
+
+  it("escapes all tag-like runs in mixed content", () => {
+    expect(escapeRawHtmlTagsOutsideCode("<T> and <div>")).toBe(
+      "&lt;T&gt; and &lt;div&gt;",
+    );
+  });
+
+  it("handles multi-line mixed content", () => {
+    const input = "<t>\n\n裸 `<tag>` 做转\n\n<tag>\n\n<t>";
+    const result = escapeRawHtmlTagsOutsideCode(input);
+    expect(result).toBe(
+      "&lt;t&gt;\n\n裸 `<tag>` 做转\n\n&lt;tag&gt;\n\n&lt;t&gt;",
+    );
+  });
+
+  it("does not touch math expressions", () => {
+    expect(escapeRawHtmlTagsOutsideCode("1 < 2 > 0")).toBe("1 < 2 > 0");
+    expect(escapeRawHtmlTagsOutsideCode("x<y")).toBe("x<y");
+  });
+});
--- a/packages/views/editor/extensions/markdown-paste.ts
+++ b/packages/views/editor/extensions/markdown-paste.ts
@@ -20,17 +20,221 @@
 * Why not clipboardTextParser? It only runs when there's NO text/html on
 * the clipboard (ProseMirror source: `let asText = !!text && !html`).
 *
- * Why not heuristic detection (looksLikeMarkdown / hasRichHtml)? Unreliable.
- * VS Code's HTML contains <code> tags that fool rich-content detectors.
- * Markdown pattern matching has too many edge cases. Instead, the classifier
- * only keeps narrow deterministic exits for editor-owned slices, code block
- * context, structured plain text, and large payloads.
+ * HTML/text classification is intentionally conservative. Rich semantic HTML
+ * should stay native so links, lists, emphasis, and inline code survive.
+ * Syntax-highlight wrappers from editors (<pre>/<code>/<span>/<div>) are not
+ * enough by themselves, because those should still paste as Markdown source.
 */
 import { Extension } from "@tiptap/core";
 import { Plugin, PluginKey } from "@tiptap/pm/state";
 import { Slice } from "@tiptap/pm/model";

 const LARGE_PASTE_TEXT_THRESHOLD = 50_000;
+// CSS selector list of elements whose presence marks clipboard HTML as rich
+// content. <pre>, <code>, <span>, and <div> are deliberately absent: the file
+// header notes those are what editor syntax-highlight wrappers consist of.
+const SEMANTIC_RICH_HTML_SELECTOR = [
+  "a[href]",
+  "b",
+  "blockquote",
+  "del",
+  "details",
+  "em",
+  "figcaption",
+  "figure",
+  "h1",
+  "h2",
+  "h3",
+  "h4",
+  "h5",
+  "h6",
+  "hr",
+  "i",
+  "img",
+  "li",
+  "mark",
+  "ol",
+  "s",
+  "strong",
+  "sub",
+  "summary",
+  "sup",
+  "table",
+  "tbody",
+  "td",
+  "tfoot",
+  "th",
+  "thead",
+  "tr",
+  "u",
+  "ul",
+].join(",");
+// Matches one tag-like run: "<", an optional "/", an ASCII letter followed by
+// letters, digits, or hyphens, optional whitespace-led attribute text, an
+// optional "/", then ">". The "g" flag makes String.replace and String.match
+// visit every occurrence in a segment rather than only the first.
+const RAW_HTML_TAG_RE = /<(\/?[a-zA-Z][a-zA-Z0-9-]*)(?:\s[^>]*)?\/?>/g;
+
+// A CommonMark lexer classifies any <word> run as raw HTML, whether or not
+// "word" names a real element. Pasted plain text is the source of truth, so
+// every tag-like run in the segment has its angle brackets turned into
+// &lt; / &gt; entities before the Markdown parser sees it. Text that does not
+// match RAW_HTML_TAG_RE (e.g. "1 < 2") is returned untouched.
+function escapeRawHtmlTagsInSegment(segment: string): string {
+  const escapeAngles = (tag: string): string =>
+    tag.split("<").join("&lt;").split(">").join("&gt;");
+  return segment.replace(RAW_HTML_TAG_RE, (tag) => escapeAngles(tag));
+}
+
+// Lists every tag-like run found in the segment, in order of appearance.
+// Returns an empty array when the segment contains none.
+function collectRawHtmlTagsInSegment(segment: string): string[] {
+  const found = segment.match(RAW_HTML_TAG_RE);
+  return found === null ? [] : found;
+}
+
+// One contiguous piece of a single line: either an inline code span (or an
+// unmatched backtick run), which must be kept verbatim, or the ordinary text
+// between code spans, where tag-like runs are significant.
+type CodeSpanSlice = { text: string; isCode: boolean };
+
+/**
+ * Finds where the backtick run that closes a code span starts.
+ *
+ * A closer is a run of exactly `runLength` backticks: an occurrence that
+ * touches another backtick on either side belongs to a longer run and is
+ * skipped.
+ *
+ * @param line - The line being scanned.
+ * @param from - Index just past the opening backtick run.
+ * @param runLength - Number of backticks in the opening run.
+ * @returns Start index of the closing run, or -1 when the opener is unmatched.
+ */
+function findClosingBacktickRun(
+  line: string,
+  from: number,
+  runLength: number,
+): number {
+  const delimiter = "`".repeat(runLength);
+  let searchFrom = from;
+
+  while (searchFrom <= line.length - runLength) {
+    const idx = line.indexOf(delimiter, searchFrom);
+    if (idx === -1) return -1;
+
+    const touchesBacktickBefore = idx > 0 && line[idx - 1] === "`";
+    const touchesBacktickAfter = line[idx + runLength] === "`";
+    if (!touchesBacktickBefore && !touchesBacktickAfter) return idx;
+
+    searchFrom = idx + 1;
+  }
+
+  return -1;
+}
+
+/**
+ * Splits one line into alternating text and inline-code slices.
+ *
+ * Concatenating the `text` of every slice reproduces the line exactly. This
+ * is the single scanner shared by the escaping and the collecting pass, so
+ * both always agree on what counts as "inside code".
+ */
+function splitLineByCodeSpans(line: string): CodeSpanSlice[] {
+  const slices: CodeSpanSlice[] = [];
+  let i = 0;
+
+  while (i < line.length) {
+    if (line[i] !== "`") {
+      const nextBacktick = line.indexOf("`", i);
+      const end = nextBacktick === -1 ? line.length : nextBacktick;
+      slices.push({ text: line.slice(i, end), isCode: false });
+      i = end;
+      continue;
+    }
+
+    let runLength = 0;
+    while (line[i + runLength] === "`") runLength++;
+
+    const closerStart = findClosingBacktickRun(line, i + runLength, runLength);
+    // An unmatched opener is just literal backticks; it contains no "<", so
+    // keeping it verbatim is the same as escaping it.
+    const end = closerStart === -1 ? i + runLength : closerStart + runLength;
+    slices.push({ text: line.slice(i, end), isCode: true });
+    i = end;
+  }
+
+  return slices;
+}
+
+/**
+ * Escapes tag-like runs in a single line, leaving inline code spans verbatim.
+ *
+ * @param line - One line of pasted text (no line breaks).
+ * @returns The line with tag-like runs outside code spans entity-escaped.
+ */
+function escapeTagsOutsideCodeSpans(line: string): string {
+  return splitLineByCodeSpans(line)
+    .map((slice) =>
+      slice.isCode ? slice.text : escapeRawHtmlTagsInSegment(slice.text),
+    )
+    .join("");
+}
+
+/**
+ * Collects the tag-like runs of a single line that sit outside inline code
+ * spans, in order of appearance.
+ *
+ * @param line - One line of pasted text (no line breaks).
+ * @returns The matched tag strings; empty when there are none.
+ */
+function collectTagsOutsideCodeSpans(line: string): string[] {
+  return splitLineByCodeSpans(line).flatMap((slice) =>
+    slice.isCode ? [] : collectRawHtmlTagsInSegment(slice.text),
+  );
+}
+
+// Leading fence run of a line: up to three spaces of indent, then at least
+// three backticks or three tildes.
+const FENCE_RUN_RE = /^ {0,3}(`{3,}|~{3,})/;
+// A closing fence carries nothing but trailing blanks after the run. The
+// optional "\r" is needed because splitting on "\n" leaves a trailing "\r" on
+// every line of CRLF input; without it a fence in such text would never close.
+const FENCE_CLOSE_RE = /^ {0,3}(`{3,}|~{3,})[ \t]*\r?$/;
+
+// A line of the pasted text plus whether it belongs to a fenced code block
+// (the fence delimiter lines themselves count as code).
+type FenceAwareLine = { line: string; isCode: boolean };
+
+/**
+ * Splits text on "\n" and tags each line as fenced code or ordinary text.
+ *
+ * Fence rules applied:
+ * - An opener is a fence run outside any open fence. For backtick fences the
+ *   rest of the line must not contain a backtick; otherwise CommonMark reads
+ *   the line as an inline code span (e.g. "```code``` text"), and treating it
+ *   as an opener would leave every following line unescaped.
+ * - A closer uses the same character as the opener, is at least as long, and
+ *   is followed only by blanks.
+ * - An unclosed fence runs to the end of the text.
+ */
+function markFencedCodeLines(text: string): FenceAwareLine[] {
+  let fenceChar = ""; // "" means we are not inside a fenced block
+  let fenceLength = 0;
+
+  return text.split("\n").map((line) => {
+    const match = FENCE_RUN_RE.exec(line);
+    const run = match?.[1];
+
+    if (match && run) {
+      if (fenceChar === "") {
+        const infoString = line.slice(match[0].length);
+        const isOpener = run.charAt(0) === "~" || !infoString.includes("`");
+        if (isOpener) {
+          fenceChar = run.charAt(0);
+          fenceLength = run.length;
+          return { line, isCode: true };
+        }
+      } else if (
+        run.charAt(0) === fenceChar &&
+        run.length >= fenceLength &&
+        FENCE_CLOSE_RE.test(line)
+      ) {
+        fenceChar = "";
+        fenceLength = 0;
+        return { line, isCode: true };
+      }
+    }
+
+    return { line, isCode: fenceChar !== "" };
+  });
+}
+
+/**
+ * Entity-escapes tag-like runs in pasted Markdown source so the Markdown
+ * parser keeps them as text instead of classifying them as raw HTML.
+ *
+ * Fenced code blocks and inline code spans are returned verbatim.
+ *
+ * @param text - Plain text from the clipboard.
+ * @returns The text with tag-like runs outside code escaped as &lt;…&gt;.
+ */
+export function escapeRawHtmlTagsOutsideCode(text: string): string {
+  return markFencedCodeLines(text)
+    .map(({ line, isCode }) =>
+      isCode ? line : escapeTagsOutsideCodeSpans(line),
+    )
+    .join("\n");
+}
+
+/**
+ * Lists, in order, every tag-like run that escapeRawHtmlTagsOutsideCode would
+ * escape, i.e. those outside fenced code blocks and inline code spans.
+ *
+ * @param text - Plain text from the clipboard.
+ * @returns The matched tag strings; empty when there are none.
+ */
+function findRawHtmlTagsOutsideCode(text: string): string[] {
+  return markFencedCodeLines(text).flatMap(({ line, isCode }) =>
+    isCode ? [] : collectTagsOutsideCodeSpans(line),
+  );
+}

 type PasteMode = "native" | "literal" | "markdown";

@@ -62,6 +266,71 @@ function isStructuredPlainText(text: string): boolean {
  return isJsonDocumentText(text);
 }

+// Reports whether an inline style attribute expresses rich-text formatting:
+// bold (keyword or weight 600-900), italic, or an underline / line-through
+// text decoration. Matching is case-insensitive.
+function hasRichStyle(style: string): boolean {
+  const css = style.toLowerCase();
+  const richDeclarations = [
+    /font-weight\s*:\s*(bold|[6-9]00)\b/,
+    /font-style\s*:\s*italic\b/,
+    /text-decoration[^;]*(line-through|underline)/,
+  ];
+  return richDeclarations.some((pattern) => pattern.test(css));
+}
+
+// Counts non-overlapping occurrences of `needle` in `text`, scanning left to
+// right. An empty needle counts as zero occurrences.
+function countOccurrences(text: string, needle: string): number {
+  if (needle.length === 0) return 0;
+
+  let total = 0;
+  for (
+    let at = text.indexOf(needle);
+    at !== -1;
+    at = text.indexOf(needle, at + needle.length)
+  ) {
+    total += 1;
+  }
+  return total;
+}
+
+/**
+ * Checks that pasting `html` natively would not lose tag-like text that is
+ * present in the clipboard's plain text.
+ *
+ * The browser's HTML parser turns a bare "<tag>" into an element, so it
+ * vanishes from the document's text content. For every distinct tag-like run
+ * found outside code in `text`, the HTML's text content must therefore
+ * contain that string at least as often as the plain text does.
+ *
+ * @param html - The text/html clipboard payload.
+ * @param text - The text/plain clipboard payload.
+ * @returns true when no tag-like text would be dropped; false when some would,
+ *   or when tags exist but no DOMParser is available to verify.
+ */
+function htmlPreservesRawTagsFromPlainText(html: string, text: string): boolean {
+  const distinctTags = new Set(findRawHtmlTagsOutsideCode(text));
+  if (distinctTags.size === 0) return true;
+  if (typeof DOMParser === "undefined") return false;
+
+  const doc = new DOMParser().parseFromString(html, "text/html");
+  const htmlText = doc.body?.textContent ?? "";
+
+  for (const tag of distinctTags) {
+    // textContent also includes text inside <code>/<pre>, so the expected
+    // count must cover ALL occurrences in the plain text, code included.
+    // Counting only the occurrences outside code would let a "<tag>" that
+    // survives inside inline code mask a bare "<tag>" that the HTML dropped.
+    const expectedCount = countOccurrences(text, tag);
+    if (countOccurrences(htmlText, tag) < expectedCount) return false;
+  }
+
+  return true;
+}
+
+/**
+ * Decides whether clipboard HTML carries real rich-text semantics and is safe
+ * to paste natively.
+ *
+ * Returns false for blank HTML, when no DOMParser exists, or when the HTML
+ * would drop tag-like text present in `text`. Otherwise returns true if the
+ * HTML contains an element from SEMANTIC_RICH_HTML_SELECTOR, a <code> element
+ * outside any <pre>, or an element whose inline style is rich formatting.
+ */
+function hasSemanticRichHtml(html: string, text: string): boolean {
+  const canInspect = html.trim() !== "" && typeof DOMParser !== "undefined";
+  if (!canInspect) return false;
+  if (!htmlPreservesRawTagsFromPlainText(html, text)) return false;
+
+  const body = new DOMParser().parseFromString(html, "text/html").body;
+  if (!body) return false;
+  if (body.querySelector(SEMANTIC_RICH_HTML_SELECTOR) !== null) return true;
+
+  // Inline <code> is meaningful rich text; <pre><code> on its own is commonly
+  // just an editor's syntax-highlight wrapper and should still be parsed as
+  // Markdown source.
+  const hasInlineCode = Array.from(body.querySelectorAll("code")).some(
+    (code) => code.closest("pre") === null,
+  );
+  if (hasInlineCode) return true;
+
+  return Array.from(body.querySelectorAll<HTMLElement>("[style]")).some((el) =>
+    hasRichStyle(el.getAttribute("style") ?? ""),
+  );
+}
+
 function classifyPaste({
  text,
  html,
@@ -72,6 +341,7 @@ function classifyPaste({
  if (!text) return "native";
  if (isInsideCodeBlock) return "literal";
  if (html && html.includes("data-pm-slice")) return "native";
+  if (html && hasSemanticRichHtml(html, text)) return "native";
  if (text.length > LARGE_PASTE_TEXT_THRESHOLD) return "literal";
  if (isStructuredPlainText(text)) return "literal";
  return "markdown";
@@ -110,8 +380,23 @@ export function createMarkdownPasteExtension() {

              // Everything else (VS Code, text editors, .md files, terminals,
              // web pages): parse text/plain as Markdown.
-              const json = editor.markdown.parse(text);
+              const preprocessed = escapeRawHtmlTagsOutsideCode(text);
+              const json = editor.markdown.parse(preprocessed);
              const node = editor.schema.nodeFromJSON(json);
+
+              // Safety net: if parsing still produces an empty doc despite
+              // non-empty input, fall back to literal insertion.
+              const first = node.content.firstChild;
+              const parsedEmpty =
+                node.content.childCount === 0 ||
+                (node.content.childCount === 1 &&
+                  first?.type.name === "paragraph" &&
+                  first.content.size === 0);
+              if (text.trim() && parsedEmpty) {
+                view.dispatch(view.state.tr.insertText(text));
+                return true;
+              }
+
              const slice = Slice.maxOpen(node.content);
              const tr = view.state.tr.replaceSelection(slice);
              view.dispatch(tr);
Author	SHA1	Message	Date
Naiyuan Qing	d7f5e11122	fix(editor): avoid native paste when html drops raw tags	2026-05-27 16:59:02 +08:00
Naiyuan Qing	d031177edc	fix(editor): prefer rich html paste when semantic	2026-05-27 16:52:54 +08:00
Naiyuan Qing	28414ee110	fix(editor): preserve raw html-like text on paste	2026-05-27 16:37:44 +08:00
Naiyuan Qing	aec279b5d8	fix(editor): escape non-standard HTML tags in paste to prevent content loss When pasting mixed content containing multiple <tag> patterns (e.g. "<t>\n裸 `<tag>` 做转\n<tag>\n<t>"), CommonMark treats bare <word> as inline HTML. ProseMirror silently drops unknown HTML elements, causing partial content loss. The previous empty-result fallback only caught the single-tag case where the entire parse result was empty. Pre-process paste text before markdown parsing: escape <tag> patterns whose tag name is not a standard HTML element, while respecting inline code spans and fenced code blocks. Standard HTML (div, br, img, etc.) passes through normally. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: multica-agent <github@multica.ai>	2026-05-27 16:03:53 +08:00
Naiyuan Qing	807e9fdabb	fix(editor): fall back to literal paste when markdown parser drops all content When pasting text like `<T>` or `<MyComponent>`, the CommonMark-compliant markdown parser treats them as inline HTML tags. ProseMirror's schema doesn't recognize unknown HTML elements, so they are silently dropped — producing an empty document from non-empty input. Detect this case (non-empty input → empty parse result) and fall back to literal text insertion so the user sees their text instead of nothing. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: multica-agent <github@multica.ai>	2026-05-27 15:28:10 +08:00