Files
multica/packages/views/editor/extensions/markdown-paste.ts
Naiyuan Qing 171ee842d4 fix(editor): preserve raw html-like text on paste (#3355)
* fix(editor): fall back to literal paste when markdown parser drops all content

When pasting text like `<T>` or `<MyComponent>`, the CommonMark-compliant
markdown parser treats them as inline HTML tags. ProseMirror's schema doesn't
recognize unknown HTML elements, so they are silently dropped — producing an
empty document from non-empty input.

Detect this case (non-empty input → empty parse result) and fall back to
literal text insertion so the user sees their text instead of nothing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Co-authored-by: multica-agent <github@multica.ai>

* fix(editor): escape non-standard HTML tags in paste to prevent content loss

When pasting mixed content containing multiple <tag> patterns (e.g.
"<t>\n裸 `<tag>` 做转\n<tag>\n<t>"), CommonMark treats bare <word>
as inline HTML. ProseMirror silently drops unknown HTML elements,
causing partial content loss. The previous empty-result fallback only
caught the single-tag case where the entire parse result was empty.

Pre-process paste text before markdown parsing: escape <tag> patterns
whose tag name is not a standard HTML element, while respecting inline
code spans and fenced code blocks. Standard HTML (div, br, img, etc.)
passes through normally.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Co-authored-by: multica-agent <github@multica.ai>

* fix(editor): preserve raw html-like text on paste

* fix(editor): prefer rich html paste when semantic

* fix(editor): avoid native paste when html drops raw tags

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
Co-authored-by: multica-agent <github@multica.ai>
2026-05-27 17:14:10 +08:00

411 lines
12 KiB
TypeScript

/**
* Markdown paste extension — ensures pasted text is parsed as Markdown.
*
* Problem: The browser clipboard can contain BOTH text/plain and text/html.
* ProseMirror always prefers text/html when present (hardcoded in
* parseFromClipboard: `let asText = !html`). When copying from VS Code,
* text editors, or .md files, the OS wraps text in <pre>/<div> HTML tags.
* ProseMirror parses these as code blocks — wrong.
*
* Solution: Use `handlePaste` (the only ProseMirror prop that runs for ALL
* paste events and has access to raw ClipboardEvent). We check for
* `data-pm-slice` in the HTML — this attribute is added by ProseMirror's
* own clipboard serializer. If present, the source is another ProseMirror
* editor and its HTML is structurally correct — let ProseMirror handle it.
* Otherwise, classify text/plain into one of three paths:
* - native: let ProseMirror or another extension handle it
* - literal: insert exact text without Markdown parsing
* - markdown: parse text/plain as Markdown
*
* Why not clipboardTextParser? It only runs when there's NO text/html on
* the clipboard (ProseMirror source: `let asText = !!text && !html`).
*
* HTML/text classification is intentionally conservative. Rich semantic HTML
* should stay native so links, lists, emphasis, and inline code survive.
* Syntax-highlight wrappers from editors (<pre>/<code>/<span>/<div>) are not
* enough by themselves, because those should still paste as Markdown source.
*/
import { Extension } from "@tiptap/core";
import { Plugin, PluginKey } from "@tiptap/pm/state";
import { Slice } from "@tiptap/pm/model";
/**
 * Plain-text pastes longer than this many characters are inserted literally
 * instead of being run through the Markdown parser.
 */
const LARGE_PASTE_TEXT_THRESHOLD = 50_000;

/**
 * CSS selector list matching elements that carry real rich-text meaning
 * (links, emphasis, headings, lists, tables, images, ...). Generic wrappers
 * such as <pre>, <code>, <span> and <div> are deliberately absent.
 */
const SEMANTIC_RICH_HTML_SELECTOR = [
  "a[href]", "b", "blockquote", "del", "details", "em",
  "figcaption", "figure",
  "h1", "h2", "h3", "h4", "h5", "h6", "hr",
  "i", "img", "li", "mark", "ol",
  "s", "strong", "sub", "summary", "sup",
  "table", "tbody", "td", "tfoot", "th", "thead", "tr",
  "u", "ul",
].join(",");
/**
 * Matches tag-like runs such as `<T>`, `</div>` or `<Foo bar="1" />`: an
 * opening angle bracket, an optional slash, an ASCII-letter-led name, optional
 * whitespace-separated attributes, and a closing bracket. Runs whose name is
 * followed by anything else (e.g. the `:` in `<https://...>`) do not match.
 */
const RAW_HTML_TAG_RE = /<(\/?[a-zA-Z][a-zA-Z0-9-]*)(?:\s[^>]*)?\/?>/g;

/** Start of a fence line: up to three spaces, then 3+ backticks or 3+ tildes. */
const FENCE_START_RE = /^ {0,3}(`{3,}|~{3,})/;

/** Only spaces/tabs — what may follow the marker run on a closing fence. */
const FENCE_TAIL_BLANK_RE = /^[ \t]*$/;

/** A slice of one line, flagged by whether it must be left untouched. */
interface LineSegment {
  text: string;
  /** True for inline code spans and for unmatched backtick runs. */
  verbatim: boolean;
}

// CommonMark treats <word> as raw HTML regardless of whether "word" is a real
// HTML element. For plain-text paste, the user's text is the source of truth, so
// escape tag-like runs before the Markdown lexer can classify them as HTML.
function escapeRawHtmlTagsInSegment(segment: string): string {
  return segment.replace(RAW_HTML_TAG_RE, (match) =>
    match.replace(/</g, "&lt;").replace(/>/g, "&gt;"),
  );
}

/** Returns every tag-like run found in `segment`, in order of appearance. */
function collectRawHtmlTagsInSegment(segment: string): string[] {
  return segment.match(RAW_HTML_TAG_RE) ?? [];
}

/**
 * Finds the closing delimiter of an inline code span.
 *
 * @param line      The line being scanned.
 * @param from      Index just past the opening backtick run.
 * @param runLength Number of backticks in the opening run.
 * @returns Index of a backtick run of exactly `runLength` (not adjacent to
 *          further backticks on either side), or -1 when the line has none.
 */
function findCodeSpanCloser(line: string, from: number, runLength: number): number {
  const delimiter = "`".repeat(runLength);
  let searchFrom = from;
  while (searchFrom <= line.length - runLength) {
    const idx = line.indexOf(delimiter, searchFrom);
    if (idx === -1) return -1;
    const partOfLongerRun =
      line.charAt(idx + runLength) === "`" || line.charAt(idx - 1) === "`";
    if (!partOfLongerRun) return idx;
    searchFrom = idx + 1;
  }
  return -1;
}

/**
 * Splits one line into plain-text pieces and verbatim pieces. A verbatim piece
 * is either a complete inline code span (delimiters included) or a backtick
 * run that has no matching closer on this line. Concatenating the pieces
 * reproduces the line exactly. Spans are matched within a single line only.
 */
function splitAroundCodeSpans(line: string): LineSegment[] {
  const segments: LineSegment[] = [];
  let i = 0;
  while (i < line.length) {
    if (line[i] !== "`") {
      const nextBacktick = line.indexOf("`", i);
      const end = nextBacktick === -1 ? line.length : nextBacktick;
      segments.push({ text: line.slice(i, end), verbatim: false });
      i = end;
      continue;
    }
    let runLength = 1;
    while (line[i + runLength] === "`") runLength++;
    const closer = findCodeSpanCloser(line, i + runLength, runLength);
    // Without a closer only the backtick run itself is consumed, so the text
    // after it is still scanned for tags.
    const end = closer === -1 ? i + runLength : closer + runLength;
    segments.push({ text: line.slice(i, end), verbatim: true });
    i = end;
  }
  return segments;
}

/** Escapes tag-like runs in `line`, leaving inline code spans untouched. */
function escapeTagsOutsideCodeSpans(line: string): string {
  return splitAroundCodeSpans(line)
    .map((segment) =>
      segment.verbatim ? segment.text : escapeRawHtmlTagsInSegment(segment.text),
    )
    .join("");
}

/** Collects tag-like runs in `line`, ignoring anything inside inline code spans. */
function collectTagsOutsideCodeSpans(line: string): string[] {
  return splitAroundCodeSpans(line).flatMap((segment) =>
    segment.verbatim ? [] : collectRawHtmlTagsInSegment(segment.text),
  );
}

/**
 * Creates a stateful predicate to be called once per line, in document order.
 * It returns true when the line is an opening fence, a closing fence, or sits
 * inside a fenced code block — i.e. when the line must not be rewritten.
 *
 * Two details matter for correctness:
 * - A trailing "\r" (left over from CRLF text split on "\n") is ignored, so
 *   closing fences are still recognised in Windows clipboard text.
 * - A backtick run followed by more backticks on the same line is not an
 *   opening fence (a backtick fence's info string may not contain backticks);
 *   such a line is an inline code span and is handled as ordinary text.
 */
function createFenceTracker(): (line: string) => boolean {
  let openFence: { marker: string; length: number } | null = null;
  return (line) => {
    const content = line.endsWith("\r") ? line.slice(0, -1) : line;
    const match = FENCE_START_RE.exec(content);
    const fence = match?.[1];
    if (match && fence) {
      const rest = content.slice(match[0].length);
      if (openFence === null) {
        if (fence.startsWith("`") && rest.includes("`")) return false;
        openFence = { marker: fence.charAt(0), length: fence.length };
        return true;
      }
      const closesBlock =
        fence.charAt(0) === openFence.marker &&
        fence.length >= openFence.length &&
        FENCE_TAIL_BLANK_RE.test(rest);
      if (closesBlock) {
        openFence = null;
        return true;
      }
    }
    return openFence !== null;
  };
}

/**
 * Escapes every tag-like run (`<T>`, `</Foo>`, ...) in `text` as
 * `&lt;...&gt;`, except inside fenced code blocks and inline code spans.
 * An unclosed fence protects everything up to the end of the text.
 *
 * @param text Raw plain text, lines separated by "\n" (a "\r" before the
 *             newline is preserved in the output).
 * @returns The text with tag-like runs outside code escaped.
 */
export function escapeRawHtmlTagsOutsideCode(text: string): string {
  const isFencedLine = createFenceTracker();
  return text
    .split("\n")
    .map((line) => (isFencedLine(line) ? line : escapeTagsOutsideCodeSpans(line)))
    .join("\n");
}

/**
 * Lists, in document order, the tag-like runs that
 * `escapeRawHtmlTagsOutsideCode` would escape for the same `text`.
 */
function findRawHtmlTagsOutsideCode(text: string): string[] {
  const isFencedLine = createFenceTracker();
  return text
    .split("\n")
    .flatMap((line) => (isFencedLine(line) ? [] : collectTagsOutsideCodeSpans(line)));
}
/**
 * How a paste event is handled:
 * - "native": the paste handler returns false and does nothing itself.
 * - "literal": the plain text is inserted as-is via `insertText`.
 * - "markdown": the plain text is parsed as Markdown and the result replaces
 *   the selection.
 */
type PasteMode = "native" | "literal" | "markdown";
/** Facts about a paste event that `classifyPaste` bases its decision on. */
interface PasteClassificationInput {
/** Clipboard content for the "text/plain" type. */
text: string;
/** Clipboard content for the "text/html" type. */
html: string;
/** Whether the clipboard carries at least one file. */
hasFiles: boolean;
/** Whether the selection starts inside a node named "codeBlock". */
isInsideCodeBlock: boolean;
}
/**
 * Tells whether `text`, ignoring surrounding whitespace, is a complete JSON
 * object or array: it must be wrapped in `{...}` or `[...]` and be accepted
 * by `JSON.parse`. Bare JSON scalars (strings, numbers, ...) do not count.
 */
function isJsonDocumentText(text: string): boolean {
  const candidate = text.trim();
  const first = candidate.charAt(0);
  const last = candidate.charAt(candidate.length - 1);

  const wrappedAsObject = first === "{" && last === "}";
  const wrappedAsArray = first === "[" && last === "]";
  if (!wrappedAsObject && !wrappedAsArray) return false;

  try {
    JSON.parse(candidate);
  } catch {
    return false;
  }
  return true;
}

/**
 * Tells whether `text` is structured data that should be pasted verbatim
 * rather than interpreted as Markdown. JSON documents are the only structured
 * format recognised here.
 */
function isStructuredPlainText(text: string): boolean {
  return isJsonDocumentText(text);
}
/**
 * Tells whether an inline `style` attribute value expresses rich-text
 * formatting: bold (keyword or weight 600-900), italic, or an underline /
 * line-through text decoration. Matching is case-insensitive.
 */
function hasRichStyle(style: string): boolean {
  const lowered = style.toLowerCase();
  const richPatterns = [
    /font-weight\s*:\s*(bold|[6-9]00)\b/,
    /font-style\s*:\s*italic\b/,
    /text-decoration[^;]*(line-through|underline)/,
  ];
  return richPatterns.some((pattern) => pattern.test(lowered));
}
/**
 * Counts non-overlapping occurrences of `needle` in `text`, scanning left to
 * right. An empty `needle` yields 0.
 */
function countOccurrences(text: string, needle: string): number {
  if (needle === "") return 0;
  let total = 0;
  for (
    let at = text.indexOf(needle);
    at !== -1;
    at = text.indexOf(needle, at + needle.length)
  ) {
    total += 1;
  }
  return total;
}
/**
 * Checks that the clipboard HTML still shows, as visible text, every tag-like
 * run (e.g. `<T>`) that appears outside code in the plain-text version.
 *
 * Each distinct run must occur in the HTML body's text content at least as
 * many times as it occurs in `text`. Returns true when `text` has no such
 * runs, and false when `DOMParser` is unavailable and the check cannot be made.
 */
function htmlPreservesRawTagsFromPlainText(html: string, text: string): boolean {
  const rawTags = findRawHtmlTagsOutsideCode(text);
  if (rawTags.length === 0) return true;
  if (typeof DOMParser === "undefined") return false;

  const parsed = new DOMParser().parseFromString(html, "text/html");
  const visibleText = parsed.body?.textContent ?? "";

  // Tally how many times each distinct run is required.
  const requiredCounts = new Map<string, number>();
  for (const rawTag of rawTags) {
    requiredCounts.set(rawTag, (requiredCounts.get(rawTag) ?? 0) + 1);
  }

  return Array.from(requiredCounts).every(
    ([rawTag, required]) => countOccurrences(visibleText, rawTag) >= required,
  );
}
/**
 * Decides whether clipboard HTML carries formatting worth keeping.
 *
 * Returns false for blank HTML, when `DOMParser` is unavailable, or when the
 * HTML fails `htmlPreservesRawTagsFromPlainText` for `text`. Otherwise returns
 * true if the parsed body contains any of:
 * - an element matching `SEMANTIC_RICH_HTML_SELECTOR`,
 * - a <code> element that is not inside a <pre>,
 * - an element whose inline style satisfies `hasRichStyle`.
 */
function hasSemanticRichHtml(html: string, text: string): boolean {
  if (html.trim() === "") return false;
  if (typeof DOMParser === "undefined") return false;
  if (!htmlPreservesRawTagsFromPlainText(html, text)) return false;

  const { body } = new DOMParser().parseFromString(html, "text/html");
  if (!body) return false;

  if (body.querySelector(SEMANTIC_RICH_HTML_SELECTOR) !== null) return true;

  // Inline <code> is meaningful rich text, whereas <pre><code> on its own is
  // often only an editor's syntax-highlight wrapper and should remain
  // eligible for Markdown parsing.
  const hasInlineCode = Array.from(body.querySelectorAll("code")).some(
    (codeEl) => codeEl.closest("pre") === null,
  );
  if (hasInlineCode) return true;

  return Array.from(body.querySelectorAll<HTMLElement>("[style]")).some((styled) =>
    hasRichStyle(styled.getAttribute("style") ?? ""),
  );
}
/**
 * Picks the paste strategy for one paste event. Rules are evaluated in order
 * and the first match wins:
 * 1. files on the clipboard, or no plain text -> "native"
 * 2. selection inside a code block -> "literal"
 * 3. HTML containing "data-pm-slice", or HTML judged semantically rich -> "native"
 * 4. plain text longer than `LARGE_PASTE_TEXT_THRESHOLD`, or structured
 *    plain text -> "literal"
 * 5. anything else -> "markdown"
 */
function classifyPaste(input: PasteClassificationInput): PasteMode {
  const { text, html, hasFiles, isInsideCodeBlock } = input;

  if (hasFiles || !text) return "native";
  if (isInsideCodeBlock) return "literal";

  if (html && (html.includes("data-pm-slice") || hasSemanticRichHtml(html, text))) {
    return "native";
  }

  if (text.length > LARGE_PASTE_TEXT_THRESHOLD || isStructuredPlainText(text)) {
    return "literal";
  }

  return "markdown";
}
/**
 * Creates the "markdownPaste" extension. It registers a single ProseMirror
 * plugin whose `handlePaste` prop classifies each paste and then either
 * declines it (returns false), inserts the plain text literally, or parses
 * the plain text as Markdown and replaces the selection with the result.
 *
 * The handler declines every paste when `editor.markdown` is not available
 * or the event has no clipboard data.
 */
export function createMarkdownPasteExtension() {
  return Extension.create({
    name: "markdownPaste",

    addProseMirrorPlugins() {
      const { editor } = this;

      return [
        new Plugin({
          key: new PluginKey("markdownPaste"),
          props: {
            handlePaste(view, event) {
              const markdown = editor.markdown;
              const clipboard = event.clipboardData;
              if (!markdown || !clipboard) return false;

              const plainText = clipboard.getData("text/plain");
              const richHtml = clipboard.getData("text/html");
              const cursorParent = view.state.selection.$from.parent;

              // Shared by the "literal" mode and the empty-parse fallback.
              const insertLiterally = (): boolean => {
                view.dispatch(view.state.tr.insertText(plainText));
                return true;
              };

              const mode = classifyPaste({
                text: plainText,
                html: richHtml,
                hasFiles: Boolean(clipboard.files?.length),
                isInsideCodeBlock: cursorParent.type.name === "codeBlock",
              });
              if (mode === "native") return false;
              if (mode === "literal") return insertLiterally();

              // Remaining sources (VS Code, text editors, .md files,
              // terminals, web pages): treat text/plain as Markdown, with
              // tag-like runs escaped first so the parser keeps them as text.
              const escaped = escapeRawHtmlTagsOutsideCode(plainText);
              const parsedDoc = editor.schema.nodeFromJSON(markdown.parse(escaped));
              const { content } = parsedDoc;

              // Safety net: non-empty input that still parses to nothing (no
              // children, or one empty paragraph) is inserted literally.
              const soleChild = content.childCount === 1 ? content.firstChild : null;
              const parsedEmpty =
                content.childCount === 0 ||
                (soleChild?.type.name === "paragraph" && soleChild.content.size === 0);
              if (parsedEmpty && plainText.trim()) return insertLiterally();

              view.dispatch(view.state.tr.replaceSelection(Slice.maxOpen(content)));
              return true;
            },
          },
        }),
      ];
    },
  });
}