Compare commits

...

5 Commits

Author SHA1 Message Date
Naiyuan Qing
d7f5e11122 fix(editor): avoid native paste when html drops raw tags 2026-05-27 16:59:02 +08:00
Naiyuan Qing
d031177edc fix(editor): prefer rich html paste when semantic 2026-05-27 16:52:54 +08:00
Naiyuan Qing
28414ee110 fix(editor): preserve raw html-like text on paste 2026-05-27 16:37:44 +08:00
Naiyuan Qing
aec279b5d8 fix(editor): escape non-standard HTML tags in paste to prevent content loss
When pasting mixed content containing multiple <tag> patterns (e.g.
"<t>\n裸 `<tag>` 做转\n<tag>\n<t>"), CommonMark treats bare <word>
as inline HTML. ProseMirror silently drops unknown HTML elements,
causing partial content loss. The previous empty-result fallback only
caught the single-tag case where the entire parse result was empty.

Pre-process paste text before markdown parsing: escape <tag> patterns
whose tag name is not a standard HTML element, while respecting inline
code spans and fenced code blocks. Standard HTML (div, br, img, etc.)
passes through normally.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Co-authored-by: multica-agent <github@multica.ai>
2026-05-27 16:03:53 +08:00
Naiyuan Qing
807e9fdabb fix(editor): fall back to literal paste when markdown parser drops all content
When pasting text like `<T>` or `<MyComponent>`, the CommonMark-compliant
markdown parser treats them as inline HTML tags. ProseMirror's schema doesn't
recognize unknown HTML elements, so they are silently dropped — producing an
empty document from non-empty input.

Detect this case (non-empty input → empty parse result) and fall back to
literal text insertion so the user sees their text instead of nothing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Co-authored-by: multica-agent <github@multica.ai>
2026-05-27 15:28:10 +08:00
2 changed files with 518 additions and 7 deletions

View File

@@ -2,7 +2,10 @@ import { describe, it, expect, afterEach, vi } from "vitest";
import { Editor } from "@tiptap/core";
import StarterKit from "@tiptap/starter-kit";
import { Markdown } from "@tiptap/markdown";
import { createMarkdownPasteExtension } from "./markdown-paste";
import {
createMarkdownPasteExtension,
escapeRawHtmlTagsOutsideCode,
} from "./markdown-paste";
interface FakeClipboard {
files: never[];
@@ -141,6 +144,93 @@ describe("markdownPaste — code block context", () => {
expect(types).toContain("heading");
});
// Clipboard HTML with inline <code> must be reported as unhandled (false) and
// must not reach the Markdown parser.
it("lets semantic rich HTML paste natively instead of reparsing plain text as Markdown", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);
  const markdownParse = vi.spyOn(editor.markdown!, "parse");

  const plainText =
    "viewFiltersToApiParams(filters) maps to Partial<ListIssuesParams>.";
  const richHtml = [
    "<p><code>viewFiltersToApiParams(filters)</code> maps to ",
    "<code>Partial&lt;ListIssuesParams&gt;</code>.</p>",
  ].join("");

  expect(paste(editor, plainText, richHtml)).toBe(false);
  expect(markdownParse).not.toHaveBeenCalled();
});

// List and emphasis markup is also left unhandled, with no Markdown parse.
it("lets list and emphasis HTML paste natively", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);
  const markdownParse = vi.spyOn(editor.markdown!, "parse");

  const plainText = "Done\nCreated filters.ts";
  const richHtml =
    "<ul><li><strong>Done</strong></li><li>Created filters.ts</li></ul>";

  expect(paste(editor, plainText, richHtml)).toBe(false);
  expect(markdownParse).not.toHaveBeenCalled();
});

// The HTML flavour here turns the raw <t>/<tag> lines into empty elements, so
// its text no longer contains them; the handler must take over (true) and the
// editor text must keep every tag from the plain-text flavour.
it("does not paste rich HTML natively when its text would drop raw tag-like lines", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  const plainChunk = "<t>\n\n裸 `<tag>` 做转\n\n<tag>";
  const htmlChunk = [
    "<div><t></t></div>",
    "<p>裸 <code>&lt;tag&gt;</code> 做转</p>",
    "<div><tag></tag></div>",
  ].join("");
  // The same chunk is pasted twice, separated by a blank line in plain text.
  const plainText = [plainChunk, plainChunk].join("\n\n");
  const richHtml = htmlChunk.repeat(2);

  expect(paste(editor, plainText, richHtml)).toBe(true);

  const pasted = editor.getText();
  expect(pasted.match(/<t>/g)).toHaveLength(2);
  expect(pasted.match(/<tag>/g)).toHaveLength(4);
  expect(pasted.match(/裸/g)).toHaveLength(2);
});

// <pre>/<code>/<span> alone is treated as an editor's highlight wrapper, so
// the plain text is still parsed as Markdown and yields a heading node.
it("still parses Markdown when HTML is only a syntax-highlight wrapper", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  const plainText = "# Heading\n\nbody";
  const wrapperHtml =
    '<pre><code><span style="color: #888"># Heading</span><br><br>body</code></pre>';

  expect(paste(editor, plainText, wrapperHtml)).toBe(true);

  const doc = editor.getJSON() as JsonNode;
  const nodeTypes = (doc.content ?? []).map((child) => child.type);
  expect(nodeTypes).toContain("heading");
});
it("inserts JSON clipboard text without running the Markdown parser", () => {
editor = makeEditor({
type: "doc",
@@ -178,6 +268,83 @@ describe("markdownPaste — code block context", () => {
expectLiteralPaste(editor, text);
});
// A lone tag-like token must be handled by the extension and kept verbatim.
it("preserves single unknown HTML-like tag (e.g. <T>)", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  expect(paste(editor, "<T>")).toBe(true);
  expect(editor.getText()).toBe("<T>");
});

it("preserves any unknown HTML-like tag (e.g. <MyComponent>)", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  expect(paste(editor, "<MyComponent>")).toBe(true);
  expect(editor.getText()).toBe("<MyComponent>");
});

// Raw tags on their own lines, mixed with an inline code span, must all
// remain visible in the editor text after the paste.
it("preserves unknown tags in mixed multi-line content", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  const clipboardText = "<t>\n\n裸 `<tag>` 做转\n\n<tag>\n\n<t>";
  expect(paste(editor, clipboardText)).toBe(true);

  const pasted = editor.getText();
  for (const fragment of ["<t>", "<tag>", "裸", "做转"]) {
    expect(pasted).toContain(fragment);
  }
});

// The editor text keeps the literal tag while the Markdown serialization
// carries it entity-escaped.
it("preserves HTML-like tags embedded in regular text", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  const clipboardText = "foo <bar> baz";
  expect(paste(editor, clipboardText)).toBe(true);
  expect(editor.getText()).toBe(clipboardText);
  expect(editor.getMarkdown()).toBe("foo &lt;bar&gt; baz");
});

// Real element names (button, img) get the same literal treatment as
// unknown ones.
it("preserves standard HTML element names as literal pasted text", () => {
  editor = makeEditor({ type: "doc", content: [{ type: "paragraph" }] });
  editor.commands.setTextSelection(1);

  const clipboardText = 'foo <button> <img src="x"> baz';
  expect(paste(editor, clipboardText)).toBe(true);
  expect(editor.getText()).toBe(clipboardText);
  expect(editor.getMarkdown()).toBe(
    'foo &lt;button&gt; &lt;img src="x"&gt; baz',
  );
});
it("does not parse oversized bracketed plain text as JSON", () => {
editor = makeEditor({
type: "doc",
@@ -192,3 +359,62 @@ describe("markdownPaste — code block context", () => {
expect(parseJsonSpy).not.toHaveBeenCalled();
});
});
describe("escapeRawHtmlTagsOutsideCode", () => {
  /** Checks every `[input, expected]` pair against the function under test. */
  const expectEscaped = (
    cases: ReadonlyArray<readonly [input: string, expected: string]>,
  ): void => {
    for (const [input, expected] of cases) {
      expect(escapeRawHtmlTagsOutsideCode(input)).toBe(expected);
    }
  };

  /** Checks that every input comes back unchanged. */
  const expectUntouched = (inputs: readonly string[]): void => {
    expectEscaped(inputs.map((input) => [input, input] as const));
  };

  it("escapes HTML-like tags", () => {
    expectEscaped([
      ["<T>", "&lt;T&gt;"],
      ["<tag>", "&lt;tag&gt;"],
      ["<MyComponent>", "&lt;MyComponent&gt;"],
      ["</tag>", "&lt;/tag&gt;"],
    ]);
  });

  it("escapes standard HTML element names too", () => {
    expectEscaped([
      ["<div>", "&lt;div&gt;"],
      ["<br>", "&lt;br&gt;"],
      ["</div>", "&lt;/div&gt;"],
      ['<img src="x">', '&lt;img src="x"&gt;'],
    ]);
  });

  it("does not escape inside inline code spans", () => {
    expectUntouched(["`<tag>`", "text `<T>` more", "``<tag>``"]);
  });

  it("does not escape inside fenced code blocks", () => {
    expectUntouched([
      "```\n<T>\n```",
      "~~~\n<tag>\n~~~",
      " ```\n<T>\n ```",
    ]);
  });

  it("escapes all tag-like runs in mixed content", () => {
    expectEscaped([["<T> and <div>", "&lt;T&gt; and &lt;div&gt;"]]);
  });

  it("handles multi-line mixed content", () => {
    // Raw tags on their own lines are escaped; the one inside backticks is not.
    expectEscaped([
      [
        "<t>\n\n裸 `<tag>` 做转\n\n<tag>\n\n<t>",
        "&lt;t&gt;\n\n裸 `<tag>` 做转\n\n&lt;tag&gt;\n\n&lt;t&gt;",
      ],
    ]);
  });

  it("does not touch math expressions", () => {
    expectUntouched(["1 < 2 > 0", "x<y"]);
  });
});

View File

@@ -20,17 +20,221 @@
* Why not clipboardTextParser? It only runs when there's NO text/html on
* the clipboard (ProseMirror source: `let asText = !!text && !html`).
*
* Why not heuristic detection (looksLikeMarkdown / hasRichHtml)? Unreliable.
* VS Code's HTML contains <code> tags that fool rich-content detectors.
* Markdown pattern matching has too many edge cases. Instead, the classifier
* only keeps narrow deterministic exits for editor-owned slices, code block
* context, structured plain text, and large payloads.
* HTML/text classification is intentionally conservative. Rich semantic HTML
* should stay native so links, lists, emphasis, and inline code survive.
* Syntax-highlight wrappers from editors (<pre>/<code>/<span>/<div>) are not
* enough by themselves, because those should still paste as Markdown source.
*/
import { Extension } from "@tiptap/core";
import { Plugin, PluginKey } from "@tiptap/pm/state";
import { Slice } from "@tiptap/pm/model";
const LARGE_PASTE_TEXT_THRESHOLD = 50_000;
/**
 * Comma-separated CSS selector list built from the element names below
 * (anchors only count when they carry an `href`).
 */
const SEMANTIC_RICH_HTML_SELECTOR = `
  a[href] b blockquote del details em figcaption figure
  h1 h2 h3 h4 h5 h6 hr i img li mark ol s strong
  sub summary sup table tbody td tfoot th thead tr u ul
`
  .trim()
  .split(/\s+/)
  .join(",");
/** Matches one tag-like run: `<name>`, `</name>`, `<name attr="x">`, `<name/>`. */
const RAW_HTML_TAG_RE = /<(\/?[a-zA-Z][a-zA-Z0-9-]*)(?:\s[^>]*)?\/?>/g;

/** Matches the leading marker of a fence line: up to 3 spaces, then 3+ backticks or tildes. */
const FENCE_MARKER_RE = /^ {0,3}(`{3,}|~{3,})/;

/** One slice of a line: either a complete inline code span or ordinary text. */
interface LineSegment {
  readonly text: string;
  readonly isCode: boolean;
}

// CommonMark treats <word> as raw HTML regardless of whether "word" is a real
// HTML element. For plain-text paste, the user's text is the source of truth, so
// escape tag-like runs before the Markdown lexer can classify them as HTML.
/**
 * Replaces `<` and `>` with `&lt;` / `&gt;` inside every tag-like run of
 * `segment`; all other text is returned untouched.
 */
function escapeRawHtmlTagsInSegment(segment: string): string {
  return segment.replace(RAW_HTML_TAG_RE, (match) =>
    match.replace(/</g, "&lt;").replace(/>/g, "&gt;"),
  );
}

/** Returns every tag-like run found in `segment`, in order of appearance. */
function collectRawHtmlTagsInSegment(segment: string): string[] {
  return segment.match(RAW_HTML_TAG_RE) ?? [];
}

/**
 * Finds the end (exclusive index) of the code span opened by a backtick run
 * of `runLength` starting at `openerStart`. The closer must be a run of
 * exactly the same length, i.e. not touching another backtick on either side.
 *
 * @returns the index just past the closing run, or -1 when there is no closer.
 */
function findCodeSpanEnd(
  line: string,
  openerStart: number,
  runLength: number,
): number {
  const delimiter = "`".repeat(runLength);
  let searchFrom = openerStart + runLength;
  for (;;) {
    const idx = line.indexOf(delimiter, searchFrom);
    if (idx === -1) return -1;
    const touchesBacktick =
      line[idx + runLength] === "`" || line[idx - 1] === "`";
    if (!touchesBacktick) return idx + runLength;
    // Part of a longer run: slide forward one character and retry.
    searchFrom = idx + 1;
  }
}

/**
 * Splits `line` into inline code spans (kept verbatim by callers) and plain
 * text. A backtick run without a matching closer is emitted as plain text on
 * its own, so scanning resumes right after it.
 */
function splitLineByCodeSpans(line: string): LineSegment[] {
  const segments: LineSegment[] = [];
  let i = 0;
  while (i < line.length) {
    if (line[i] === "`") {
      let runLength = 1;
      while (line[i + runLength] === "`") runLength++;
      const spanEnd = findCodeSpanEnd(line, i, runLength);
      if (spanEnd === -1) {
        segments.push({ text: line.slice(i, i + runLength), isCode: false });
        i += runLength;
      } else {
        segments.push({ text: line.slice(i, spanEnd), isCode: true });
        i = spanEnd;
      }
      continue;
    }
    const nextBacktick = line.indexOf("`", i);
    const end = nextBacktick === -1 ? line.length : nextBacktick;
    segments.push({ text: line.slice(i, end), isCode: false });
    i = end;
  }
  return segments;
}

/** Escapes tag-like runs in `line`, leaving inline code spans untouched. */
function escapeTagsOutsideCodeSpans(line: string): string {
  return splitLineByCodeSpans(line)
    .map((segment) =>
      segment.isCode ? segment.text : escapeRawHtmlTagsInSegment(segment.text),
    )
    .join("");
}

/** Collects tag-like runs in `line`, ignoring inline code spans. */
function collectTagsOutsideCodeSpans(line: string): string[] {
  return splitLineByCodeSpans(line).flatMap((segment) =>
    segment.isCode ? [] : collectRawHtmlTagsInSegment(segment.text),
  );
}

/**
 * Flags, for each line, whether it belongs to a fenced code block (the fence
 * lines themselves included). An unclosed fence runs to the end of the input.
 */
function flagFencedCodeLines(lines: readonly string[]): boolean[] {
  let inFencedBlock = false;
  let fenceChar = "";
  let fenceLen = 0;

  return lines.map((line) => {
    const fence = FENCE_MARKER_RE.exec(line)?.[1];
    if (fence) {
      const rest = line.replace(FENCE_MARKER_RE, "");
      if (!inFencedBlock) {
        // A backtick fence's info string may not contain backticks; a line
        // such as "```<T>```" is an inline code span, not a fence opener.
        const opensFence = fence.charAt(0) !== "`" || !rest.includes("`");
        if (opensFence) {
          inFencedBlock = true;
          fenceChar = fence.charAt(0);
          fenceLen = fence.length;
          return true;
        }
      } else if (
        fence.charAt(0) === fenceChar &&
        fence.length >= fenceLen &&
        // Only trailing blanks may follow a closer; a lone "\r" left over
        // from CRLF text is tolerated so the fence still closes.
        /^[ \t]*\r?$/.test(rest)
      ) {
        inFencedBlock = false;
        return true;
      }
    }
    return inFencedBlock;
  });
}

/**
 * Entity-escapes every tag-like run (`<T>`, `</div>`, `<img src="x">`, ...)
 * in `text`, except inside inline code spans and fenced code blocks.
 *
 * @param text Plain clipboard text, lines separated by `\n`.
 * @returns The text with `<` / `>` of tag-like runs replaced by `&lt;` / `&gt;`.
 */
export function escapeRawHtmlTagsOutsideCode(text: string): string {
  const lines = text.split("\n");
  const fenced = flagFencedCodeLines(lines);
  return lines
    .map((line, index) =>
      fenced[index] ? line : escapeTagsOutsideCodeSpans(line),
    )
    .join("\n");
}

/**
 * Lists every tag-like run in `text` that sits outside inline code spans and
 * fenced code blocks, in document order (duplicates preserved).
 */
function findRawHtmlTagsOutsideCode(text: string): string[] {
  const lines = text.split("\n");
  const fenced = flagFencedCodeLines(lines);
  return lines.flatMap((line, index) =>
    fenced[index] ? [] : collectTagsOutsideCodeSpans(line),
  );
}
type PasteMode = "native" | "literal" | "markdown";
@@ -62,6 +266,71 @@ function isStructuredPlainText(text: string): boolean {
return isJsonDocumentText(text);
}
/**
 * Reports whether an inline `style` attribute value declares bold
 * (`bold` or 600-900), italic, underline or line-through formatting.
 * Matching is case-insensitive.
 */
function hasRichStyle(style: string): boolean {
  const css = style.toLowerCase();
  const richDeclarations = [
    /font-weight\s*:\s*(bold|[6-9]00)\b/,
    /font-style\s*:\s*italic\b/,
    /text-decoration[^;]*(line-through|underline)/,
  ];
  return richDeclarations.some((pattern) => pattern.test(css));
}
/**
 * Counts non-overlapping occurrences of `needle` in `text`, scanning left to
 * right. An empty needle counts as zero occurrences.
 */
function countOccurrences(text: string, needle: string): number {
  if (needle.length === 0) return 0;
  // Splitting on the needle yields one more piece than there are matches.
  return text.split(needle).length - 1;
}
/**
 * Checks whether the text content of the clipboard HTML still contains the
 * raw tag-like runs (`<t>`, `<tag>`, ...) found in the plain-text flavour.
 *
 * @param html Clipboard `text/html` payload.
 * @param text Clipboard `text/plain` payload.
 * @returns `true` when the plain text has no raw tags outside code, or when
 *   each of them occurs in the HTML's text content at least as often as in
 *   the plain text; `false` otherwise, and whenever `DOMParser` is
 *   unavailable while raw tags exist.
 */
function htmlPreservesRawTagsFromPlainText(html: string, text: string): boolean {
  const rawTags = new Set(findRawHtmlTagsOutsideCode(text));
  if (rawTags.size === 0) return true;
  if (typeof DOMParser === "undefined") return false;

  const doc = new DOMParser().parseFromString(html, "text/html");
  const htmlText = doc.body?.textContent ?? "";

  for (const tag of rawTags) {
    // Compare against every occurrence in the plain text, code spans
    // included. The HTML's text content also contains the code-span copies
    // (e.g. <code>&lt;tag&gt;</code>), so counting only the raw occurrences
    // would let those copies mask a raw tag the HTML has dropped.
    if (countOccurrences(htmlText, tag) < countOccurrences(text, tag)) {
      return false;
    }
  }
  return true;
}
/**
 * Decides whether the clipboard HTML carries real rich-text semantics.
 *
 * Returns `false` for blank HTML, when `DOMParser` is unavailable, or when
 * `htmlPreservesRawTagsFromPlainText` rejects the pair. Otherwise returns
 * `true` if the parsed body contains an element matching
 * `SEMANTIC_RICH_HTML_SELECTOR`, a `<code>` outside any `<pre>`, or an
 * element whose inline style passes `hasRichStyle`.
 */
function hasSemanticRichHtml(html: string, text: string): boolean {
  const canInspect = html.trim() !== "" && typeof DOMParser !== "undefined";
  if (!canInspect) return false;
  if (!htmlPreservesRawTagsFromPlainText(html, text)) return false;

  const { body } = new DOMParser().parseFromString(html, "text/html");
  if (!body) return false;

  if (body.querySelector(SEMANTIC_RICH_HTML_SELECTOR) !== null) return true;

  // Inline <code> carries meaningful rich-text semantics. A <pre><code> pair
  // alone is often just a syntax-highlight wrapper from editors, so keep that
  // path available for Markdown parsing.
  const hasInlineCode = Array.from(body.querySelectorAll("code")).some(
    (code) => code.closest("pre") === null,
  );
  if (hasInlineCode) return true;

  return Array.from(body.querySelectorAll<HTMLElement>("[style]")).some(
    (el) => hasRichStyle(el.getAttribute("style") ?? ""),
  );
}
function classifyPaste({
text,
html,
@@ -72,6 +341,7 @@ function classifyPaste({
if (!text) return "native";
if (isInsideCodeBlock) return "literal";
if (html && html.includes("data-pm-slice")) return "native";
if (html && hasSemanticRichHtml(html, text)) return "native";
if (text.length > LARGE_PASTE_TEXT_THRESHOLD) return "literal";
if (isStructuredPlainText(text)) return "literal";
return "markdown";
@@ -110,8 +380,23 @@ export function createMarkdownPasteExtension() {
// Everything else (VS Code, text editors, .md files, terminals,
// web pages): parse text/plain as Markdown.
const json = editor.markdown.parse(text);
const preprocessed = escapeRawHtmlTagsOutsideCode(text);
const json = editor.markdown.parse(preprocessed);
const node = editor.schema.nodeFromJSON(json);
// Safety net: if parsing still produces an empty doc despite
// non-empty input, fall back to literal insertion.
const first = node.content.firstChild;
const parsedEmpty =
node.content.childCount === 0 ||
(node.content.childCount === 1 &&
first?.type.name === "paragraph" &&
first.content.size === 0);
if (text.trim() && parsedEmpty) {
view.dispatch(view.state.tr.insertText(text));
return true;
}
const slice = Slice.maxOpen(node.content);
const tr = view.state.tr.replaceSelection(slice);
view.dispatch(tr);