mirror of
https://github.com/multica-ai/multica.git
synced 2026-07-05 13:29:44 +02:00
* fix(editor): fall back to literal paste when markdown parser drops all content When pasting text like `<T>` or `<MyComponent>`, the CommonMark-compliant markdown parser treats them as inline HTML tags. ProseMirror's schema doesn't recognize unknown HTML elements, so they are silently dropped — producing an empty document from non-empty input. Detect this case (non-empty input → empty parse result) and fall back to literal text insertion so the user sees their text instead of nothing. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: multica-agent <github@multica.ai> * fix(editor): escape non-standard HTML tags in paste to prevent content loss When pasting mixed content containing multiple <tag> patterns (e.g. "<t>\n裸 `<tag>` 做转\n<tag>\n<t>"), CommonMark treats bare <word> as inline HTML. ProseMirror silently drops unknown HTML elements, causing partial content loss. The previous empty-result fallback only caught the single-tag case where the entire parse result was empty. Pre-process paste text before markdown parsing: escape <tag> patterns whose tag name is not a standard HTML element, while respecting inline code spans and fenced code blocks. Standard HTML (div, br, img, etc.) passes through normally. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: multica-agent <github@multica.ai> * fix(editor): preserve raw html-like text on paste * fix(editor): prefer rich html paste when semantic * fix(editor): avoid native paste when html drops raw tags --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: multica-agent <github@multica.ai>
411 lines
12 KiB
TypeScript
411 lines
12 KiB
TypeScript
/**
|
|
* Markdown paste extension — ensures pasted text is parsed as Markdown.
|
|
*
|
|
* Problem: The browser clipboard can contain BOTH text/plain and text/html.
|
|
* ProseMirror always prefers text/html when present (hardcoded in
|
|
* parseFromClipboard: `let asText = !html`). When copying from VS Code,
|
|
* text editors, or .md files, the OS wraps text in <pre>/<div> HTML tags.
|
|
* ProseMirror parses these as code blocks — wrong.
|
|
*
|
|
* Solution: Use `handlePaste` (the only ProseMirror prop that runs for ALL
|
|
* paste events and has access to raw ClipboardEvent). We check for
|
|
* `data-pm-slice` in the HTML — this attribute is added by ProseMirror's
|
|
* own clipboard serializer. If present, the source is another ProseMirror
|
|
* editor and its HTML is structurally correct — let ProseMirror handle it.
|
|
* Otherwise, classify text/plain into one of three paths:
|
|
* - native: let ProseMirror or another extension handle it
|
|
* - literal: insert exact text without Markdown parsing
|
|
* - markdown: parse text/plain as Markdown
|
|
*
|
|
* Why not clipboardTextParser? It only runs when there's NO text/html on
|
|
* the clipboard (ProseMirror source: `let asText = !!text && !html`).
|
|
*
|
|
* HTML/text classification is intentionally conservative. Rich semantic HTML
|
|
* should stay native so links, lists, emphasis, and inline code survive.
|
|
* Syntax-highlight wrappers from editors (<pre>/<code>/<span>/<div>) are not
|
|
* enough by themselves, because those should still paste as Markdown source.
|
|
*/
|
|
import { Extension } from "@tiptap/core";
|
|
import { Plugin, PluginKey } from "@tiptap/pm/state";
|
|
import { Slice } from "@tiptap/pm/model";
|
|
|
|
/** Character count above which `classifyPaste` treats plain text as "literal". */
const LARGE_PASTE_TEXT_THRESHOLD = 50_000;

/**
 * Comma-separated selector list used by `hasSemanticRichHtml`: if any of these
 * elements is present in the clipboard HTML, the paste is left to the native
 * handler. Wrapper-only tags (pre/code/span/div) are deliberately absent.
 */
const SEMANTIC_RICH_HTML_SELECTOR = [
  // links and inline emphasis
  "a[href]", "b", "blockquote", "del", "details", "em",
  // figures
  "figcaption", "figure",
  // headings and rules
  "h1", "h2", "h3", "h4", "h5", "h6", "hr",
  // more inline marks, images, list items
  "i", "img", "li", "mark", "ol", "s", "strong", "sub", "summary", "sup",
  // tables
  "table", "tbody", "td", "tfoot", "th", "thead", "tr",
  // underline and unordered lists
  "u", "ul",
].join(",");
|
|
/**
 * Matches anything shaped like an opening, closing, or self-closing HTML tag:
 * `<name>`, `</name>`, `<name attr…>`, `<name/>`. The tag name does not have to
 * be a real HTML element.
 */
const RAW_HTML_TAG_RE = /<(\/?[a-zA-Z][a-zA-Z0-9-]*)(?:\s[^>]*)?\/?>/g;

// CommonMark treats <word> as raw HTML regardless of whether "word" is a real
// HTML element. For plain-text paste, the user's text is the source of truth, so
// escape tag-like runs before the Markdown lexer can classify them as HTML.
/**
 * Replaces the angle brackets of every tag-like run in `segment` with the
 * `&lt;` / `&gt;` entity references so the Markdown parser sees literal text
 * instead of inline HTML. Text that does not match `RAW_HTML_TAG_RE`
 * (e.g. `a < b`) is returned unchanged.
 *
 * @param segment Text known to be outside code spans and fenced code blocks.
 * @returns The segment with tag-like runs entity-escaped.
 */
function escapeRawHtmlTagsInSegment(segment: string): string {
  return segment.replace(RAW_HTML_TAG_RE, (match) =>
    // A match can contain more than one "<" (e.g. `<a b<c>`), so replace
    // globally. Regex-based replace avoids depending on String.replaceAll.
    match.replace(/</g, "&lt;").replace(/>/g, "&gt;"),
  );
}
|
|
|
|
/**
 * Lists every tag-like run (as matched by `RAW_HTML_TAG_RE`) found in
 * `segment`, in order of appearance. Returns an empty array when none match.
 */
function collectRawHtmlTagsInSegment(segment: string): string[] {
  const matches = segment.match(RAW_HTML_TAG_RE);
  if (matches === null) return [];
  return matches;
}
|
|
|
|
/**
 * Escapes tag-like runs on a single line while leaving inline code spans
 * untouched.
 *
 * A code span opens with a run of N backticks and closes at the next run of
 * exactly N backticks (not adjacent to further backticks). If no such closer
 * exists on the line, the opening run is emitted as-is and scanning resumes
 * right after it.
 *
 * @param line One line of pasted text (no fenced-code handling happens here).
 * @returns The line with tags outside code spans escaped.
 */
function escapeTagsOutsideCodeSpans(line: string): string {
  const BACKTICK = "`";
  let output = "";
  let pos = 0;

  while (pos < line.length) {
    // Plain text up to the next backtick (or end of line): escape it.
    if (line[pos] !== BACKTICK) {
      const nextTick = line.indexOf(BACKTICK, pos);
      const textEnd = nextTick === -1 ? line.length : nextTick;
      output += escapeRawHtmlTagsInSegment(line.slice(pos, textEnd));
      pos = textEnd;
      continue;
    }

    // Measure the opening backtick run.
    let runEnd = pos;
    while (runEnd < line.length && line[runEnd] === BACKTICK) runEnd++;
    const runLength = runEnd - pos;
    const run = line.slice(pos, runEnd);

    // Look for a closing run of the same length that is not part of a longer
    // backtick run on either side.
    let spanEnd = -1;
    for (
      let probe = line.indexOf(run, runEnd);
      probe !== -1;
      probe = line.indexOf(run, probe + 1)
    ) {
      const closerEnd = probe + runLength;
      const followedByTick = closerEnd < line.length && line[closerEnd] === BACKTICK;
      const precededByTick = line[probe - 1] === BACKTICK;
      if (!followedByTick && !precededByTick) {
        spanEnd = closerEnd;
        break;
      }
    }

    if (spanEnd === -1) {
      // Unmatched opener: keep the backticks (they can never contain a tag)
      // and continue scanning after them.
      output += run;
      pos = runEnd;
    } else {
      // Complete code span: copy verbatim, delimiters included.
      output += line.slice(pos, spanEnd);
      pos = spanEnd;
    }
  }

  return output;
}
|
|
|
|
/**
 * Gathers the tag-like runs that appear on a line outside inline code spans.
 *
 * Code spans are recognised the same way as in the escaping pass: an opening
 * run of N backticks closed by a later run of exactly N backticks. Content of
 * a matched span is skipped; an unmatched opening run is skipped on its own.
 *
 * @param line One line of pasted text.
 * @returns Matched tag strings in order of appearance.
 */
function collectTagsOutsideCodeSpans(line: string): string[] {
  const collected: string[] = [];
  let cursor = 0;

  while (cursor < line.length) {
    if (line[cursor] !== "`") {
      // Scan the plain-text stretch up to the next backtick.
      const backtickAt = line.indexOf("`", cursor);
      const stop = backtickAt === -1 ? line.length : backtickAt;
      collected.push(...collectRawHtmlTagsInSegment(line.slice(cursor, stop)));
      cursor = stop;
      continue;
    }

    // line[cursor] is a backtick, so the run is at least one character long.
    let runLength = 1;
    while (line[cursor + runLength] === "`") runLength++;
    const run = "`".repeat(runLength);
    const afterRun = cursor + runLength;

    // Default: no closer found, resume right after the opening run.
    let resumeAt = afterRun;
    let candidate = line.indexOf(run, afterRun);
    while (candidate !== -1) {
      const closerEnd = candidate + runLength;
      const standsAlone =
        line[closerEnd] !== "`" && line[candidate - 1] !== "`";
      if (standsAlone) {
        resumeAt = closerEnd;
        break;
      }
      candidate = line.indexOf(run, candidate + 1);
    }
    cursor = resumeAt;
  }

  return collected;
}
|
|
|
|
/**
 * Escapes tag-like runs (`<word>`, `</word>`, …) in pasted text so the Markdown
 * parser keeps them as literal text, while leaving fenced code blocks and
 * inline code spans untouched.
 *
 * Fence tracking follows CommonMark:
 * - a fence is 3+ backticks or tildes indented by at most three spaces;
 * - a backtick fence whose info string contains a backtick is not a fence
 *   (it is inline code such as "```a``` b");
 * - a closing fence uses the same character, is at least as long as the
 *   opener, and is followed only by spaces/tabs.
 *
 * Lines are split on "\n"; a trailing "\r" (CRLF clipboard text) is ignored
 * when recognising fences but preserved in the returned text.
 *
 * @param text Raw text/plain clipboard content.
 * @returns The text with tags outside code escaped; line endings unchanged.
 */
export function escapeRawHtmlTagsOutsideCode(text: string): string {
  let inFencedBlock = false;
  let fenceChar = "";
  let fenceLen = 0;

  const processed = text.split("\n").map((line) => {
    // Strip the CR of a CRLF pair for fence detection only; otherwise a
    // closing fence like "```\r" would never be recognised.
    const content = line.endsWith("\r") ? line.slice(0, -1) : line;
    const fenceMatch = /^ {0,3}(`{3,}|~{3,})/.exec(content);
    const fence = fenceMatch?.[1];

    if (fenceMatch && fence) {
      const rest = content.slice(fenceMatch[0].length);

      if (!inFencedBlock) {
        // Backtick fences may not carry backticks in their info string.
        const isOpeningFence = fence.charAt(0) !== "`" || !rest.includes("`");
        if (isOpeningFence) {
          inFencedBlock = true;
          fenceChar = fence.charAt(0);
          fenceLen = fence.length;
          return line;
        }
      } else {
        const isClosingFence =
          fence.charAt(0) === fenceChar &&
          fence.length >= fenceLen &&
          /^[ \t]*$/.test(rest);
        if (isClosingFence) {
          inFencedBlock = false;
          return line;
        }
      }
    }

    // Inside a fenced block everything is code; leave it verbatim.
    if (inFencedBlock) return line;
    return escapeTagsOutsideCodeSpans(line);
  });

  return processed.join("\n");
}
|
|
|
|
/**
 * Collects every tag-like run (`<word>`, `</word>`, …) that appears in `text`
 * outside fenced code blocks and inline code spans.
 *
 * Fence tracking follows CommonMark:
 * - a fence is 3+ backticks or tildes indented by at most three spaces;
 * - a backtick fence whose info string contains a backtick is not a fence;
 * - a closing fence uses the same character, is at least as long as the
 *   opener, and is followed only by spaces/tabs.
 *
 * Lines are split on "\n"; a trailing "\r" (CRLF clipboard text) is ignored
 * when recognising fences so that "```\r" still closes a block.
 *
 * @param text Raw text/plain clipboard content.
 * @returns Matched tag strings in document order (duplicates kept).
 */
function findRawHtmlTagsOutsideCode(text: string): string[] {
  const tags: string[] = [];
  let inFencedBlock = false;
  let fenceChar = "";
  let fenceLen = 0;

  for (const line of text.split("\n")) {
    // Strip the CR of a CRLF pair for fence detection only.
    const content = line.endsWith("\r") ? line.slice(0, -1) : line;
    const fenceMatch = /^ {0,3}(`{3,}|~{3,})/.exec(content);
    const fence = fenceMatch?.[1];

    if (fenceMatch && fence) {
      const rest = content.slice(fenceMatch[0].length);

      if (!inFencedBlock) {
        // Backtick fences may not carry backticks in their info string.
        const isOpeningFence = fence.charAt(0) !== "`" || !rest.includes("`");
        if (isOpeningFence) {
          inFencedBlock = true;
          fenceChar = fence.charAt(0);
          fenceLen = fence.length;
          continue;
        }
      } else {
        const isClosingFence =
          fence.charAt(0) === fenceChar &&
          fence.length >= fenceLen &&
          /^[ \t]*$/.test(rest);
        if (isClosingFence) {
          inFencedBlock = false;
          continue;
        }
      }
    }

    // Only lines outside fenced blocks can contribute tags.
    if (!inFencedBlock) {
      tags.push(...collectTagsOutsideCodeSpans(line));
    }
  }

  return tags;
}
|
|
|
|
/**
 * How a paste event is handled:
 * - "native": this plugin steps aside (handlePaste returns false);
 * - "literal": the plain text is inserted verbatim;
 * - "markdown": the plain text is parsed as Markdown.
 */
type PasteMode = "native" | "literal" | "markdown";

/** Facts about a paste event that `classifyPaste` bases its decision on. */
interface PasteClassificationInput {
  /** Clipboard `text/plain` payload. */
  text: string;
  /** Clipboard `text/html` payload. */
  html: string;
  /** Whether the clipboard carries at least one file. */
  hasFiles: boolean;
  /** Whether the selection starts inside a `codeBlock` node. */
  isInsideCodeBlock: boolean;
}
|
|
|
|
/**
 * Reports whether `text` (ignoring surrounding whitespace) is a complete JSON
 * object or array. Bare JSON scalars such as `"str"` or `42` do not count.
 */
function isJsonDocumentText(text: string): boolean {
  const candidate = text.trim();

  // Cheap bracket check first so ordinary prose never reaches JSON.parse.
  const wrappedAsObject = candidate.startsWith("{") && candidate.endsWith("}");
  const wrappedAsArray = candidate.startsWith("[") && candidate.endsWith("]");
  if (!wrappedAsObject && !wrappedAsArray) return false;

  try {
    JSON.parse(candidate);
  } catch {
    return false;
  }
  return true;
}
|
|
|
|
/**
 * Reports whether `text` is structured data that should be pasted verbatim
 * rather than parsed as Markdown. Currently only JSON documents qualify.
 */
function isStructuredPlainText(text: string): boolean {
  const looksLikeJson = isJsonDocumentText(text);
  return looksLikeJson;
}
|
|
|
|
/**
 * Reports whether an inline `style` attribute value expresses bold (keyword
 * `bold` or weight 600–900), italic, underline, or line-through formatting.
 * Matching is case-insensitive.
 */
function hasRichStyle(style: string): boolean {
  const richDeclarations = [
    /font-weight\s*:\s*(bold|[6-9]00)\b/,
    /font-style\s*:\s*italic\b/,
    /text-decoration[^;]*(line-through|underline)/,
  ];
  const lowered = style.toLowerCase();
  return richDeclarations.some((pattern) => pattern.test(lowered));
}
|
|
|
|
/**
 * Counts non-overlapping occurrences of `needle` in `text`, scanning left to
 * right. An empty needle yields 0.
 */
function countOccurrences(text: string, needle: string): number {
  if (needle.length === 0) return 0;

  let total = 0;
  for (
    let at = text.indexOf(needle);
    at !== -1;
    at = text.indexOf(needle, at + needle.length)
  ) {
    total += 1;
  }
  return total;
}
|
|
|
|
/**
 * Checks that the clipboard HTML, once parsed, still shows every tag-like run
 * the plain text contains outside code.
 *
 * Each distinct tag string must occur in the HTML's text content at least as
 * many times as it occurs in the plain text. Returns true when the plain text
 * has no such tags, and false when `DOMParser` is unavailable.
 *
 * @param html Clipboard `text/html` payload.
 * @param text Clipboard `text/plain` payload.
 */
function htmlPreservesRawTagsFromPlainText(html: string, text: string): boolean {
  const rawTags = findRawHtmlTagsOutsideCode(text);
  if (rawTags.length === 0) return true;
  if (typeof DOMParser === "undefined") return false;

  const parsed = new DOMParser().parseFromString(html, "text/html");
  const visibleText = parsed.body?.textContent ?? "";

  // Tally how often each tag string is expected to appear.
  const required = new Map<string, number>();
  rawTags.forEach((tag) => {
    required.set(tag, (required.get(tag) ?? 0) + 1);
  });

  return Array.from(required.entries()).every(
    ([tag, needed]) => countOccurrences(visibleText, tag) >= needed,
  );
}
|
|
|
|
/**
 * Decides whether clipboard HTML carries real rich-text semantics that are
 * worth keeping via the native paste path.
 *
 * Returns false when the HTML is blank, `DOMParser` is unavailable, or the
 * HTML would lose tag-like text present in the plain-text payload. Otherwise
 * returns true if the HTML contains a semantic element, a `<code>` outside
 * `<pre>`, or an inline style expressing bold/italic/underline/strike.
 *
 * @param html Clipboard `text/html` payload.
 * @param text Clipboard `text/plain` payload.
 */
function hasSemanticRichHtml(html: string, text: string): boolean {
  const unusable = html.trim() === "" || typeof DOMParser === "undefined";
  if (unusable) return false;

  if (!htmlPreservesRawTagsFromPlainText(html, text)) return false;

  const body = new DOMParser().parseFromString(html, "text/html").body;
  if (!body) return false;

  if (body.querySelector(SEMANTIC_RICH_HTML_SELECTOR) !== null) return true;

  // <code> on its own is meaningful inline formatting; <pre><code> is
  // usually just an editor's syntax-highlight wrapper and does not count.
  const hasInlineCode = Array.from(body.querySelectorAll("code")).some(
    (codeEl) => codeEl.closest("pre") === null,
  );
  if (hasInlineCode) return true;

  return Array.from(body.querySelectorAll<HTMLElement>("[style]")).some(
    (styled) => hasRichStyle(styled.getAttribute("style") ?? ""),
  );
}
|
|
|
|
/**
 * Chooses how a paste should be handled. Rules are applied in order:
 *
 * 1. files on the clipboard, or no plain text → "native";
 * 2. selection inside a code block → "literal";
 * 3. HTML from a ProseMirror editor (`data-pm-slice`) → "native";
 * 4. HTML with semantic rich content → "native";
 * 5. plain text longer than the large-paste threshold → "literal";
 * 6. structured plain text (JSON) → "literal";
 * 7. anything else → "markdown".
 */
function classifyPaste(input: PasteClassificationInput): PasteMode {
  const { text, html, hasFiles, isInsideCodeBlock } = input;

  if (hasFiles || !text) return "native";
  if (isInsideCodeBlock) return "literal";

  if (html) {
    if (html.includes("data-pm-slice")) return "native";
    if (hasSemanticRichHtml(html, text)) return "native";
  }

  const tooLarge = text.length > LARGE_PASTE_TEXT_THRESHOLD;
  return tooLarge || isStructuredPlainText(text) ? "literal" : "markdown";
}
|
|
|
|
/**
 * Builds the Tiptap extension that intercepts paste events and, depending on
 * the clipboard classification, lets the native handler run, inserts the plain
 * text verbatim, or parses the plain text as Markdown.
 *
 * The plugin does nothing (returns false from `handlePaste`) when the editor
 * has no `markdown` facility or the event carries no clipboard data.
 */
export function createMarkdownPasteExtension() {
  return Extension.create({
    name: "markdownPaste",

    addProseMirrorPlugins() {
      const { editor } = this;

      const markdownPastePlugin = new Plugin({
        key: new PluginKey("markdownPaste"),
        props: {
          handlePaste(view, event) {
            const markdown = editor.markdown;
            if (!markdown) return false;

            const clipboard = event.clipboardData;
            if (!clipboard) return false;

            const text = clipboard.getData("text/plain");
            const html = clipboard.getData("text/html");
            const { $from } = view.state.selection;

            // Inserts the clipboard text exactly as-is at the selection.
            const pasteLiterally = (): boolean => {
              view.dispatch(view.state.tr.insertText(text));
              return true;
            };

            const mode = classifyPaste({
              text,
              html,
              hasFiles: Boolean(clipboard.files?.length),
              isInsideCodeBlock: $from.parent.type.name === "codeBlock",
            });

            switch (mode) {
              case "native":
                return false;
              case "literal":
                return pasteLiterally();
              default:
                break;
            }

            // Everything else (VS Code, text editors, .md files, terminals,
            // web pages): parse text/plain as Markdown, after protecting
            // tag-like text from being read as inline HTML.
            const escaped = escapeRawHtmlTagsOutsideCode(text);
            const doc = editor.schema.nodeFromJSON(markdown.parse(escaped));

            // Safety net: non-empty input that parses to nothing (no
            // children, or a single empty paragraph) is inserted literally.
            const { content } = doc;
            const onlyChild = content.firstChild;
            const isSingleEmptyParagraph =
              content.childCount === 1 &&
              onlyChild?.type.name === "paragraph" &&
              onlyChild.content.size === 0;
            const parsedEmpty = content.childCount === 0 || isSingleEmptyParagraph;
            if (text.trim() && parsedEmpty) {
              return pasteLiterally();
            }

            view.dispatch(
              view.state.tr.replaceSelection(Slice.maxOpen(content)),
            );
            return true;
          },
        },
      });

      return [markdownPastePlugin];
    },
  });
}
|