Compare commits

...

1 Commits

Author SHA1 Message Date
Naiyuan Qing
bb77ec0296 perf(editor): parse large markdown in chunks to fix O(n²) freeze
@tiptap/markdown parses via marked, whose tokenizer is O(n²) in document
length. Opening a large markdown doc (issue description, agent
instructions, …) froze the UI for tens of seconds: a 533KB plain-text doc
took 61.8s to parse while the subsequent ProseMirror setContent was only
40ms. Upgrading marked doesn't help — already on 17.0.5, whose fix only
covers `_`/`*` delimiter runs, not general prose.

Parse large markdown in chunks instead of in one shot: split on blank
lines outside fenced code blocks, parse each chunk independently, then
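concatenate the resulting docs. This drops marked's cost to O(n²/k) while
producing the same document in the common case (known exception: a loose
list straddling a chunk boundary may render as two adjacent lists). Applied transparently at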
ContentEditor's two parse entry points (mount + WS-driven re-parse), gated
at 50KB so normal small docs stay on the single-parse fast path.

533KB: parse 61.8s -> 0.95s (65x), open 100s -> 3.2s (31x).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 16:20:45 +08:00
2 changed files with 112 additions and 6 deletions

View File

@@ -43,6 +43,11 @@ import type { UploadResult } from "@multica/core/hooks/use-file-upload";
import { useWorkspaceSlug } from "@multica/core/paths";
import { useQueryClient } from "@tanstack/react-query";
import type { Attachment } from "@multica/core/types";
import {
parseMarkdownChunked,
MARKDOWN_CHUNK_THRESHOLD,
type MarkdownManagerLike,
} from "./utils/parse-markdown-chunked";
import type { MentionItem } from "./extensions/mention-suggestion";
import { createEditorExtensions } from "./extensions";
import { uploadAndInsertFile } from "./extensions/file-upload";
@@ -176,16 +181,43 @@ const ContentEditor = forwardRef<ContentEditorRef, ContentEditorProps>(
const queryClient = useQueryClient();
const initialContent = defaultValue ? preprocessMarkdown(defaultValue) : "";
// Large markdown is parsed in chunks to dodge marked's O(n²) tokenizer (see
// parseMarkdownChunked). Small docs stay on the single-parse fast path.
const mountChunked = initialContent.length > MARKDOWN_CHUNK_THRESHOLD;
const editor = useEditor({
immediatelyRender: false,
// Note: in v3.22.1 the default is already false/undefined (same behavior).
// Explicit for clarity — the real perf win is useEditorState in BubbleMenu.
shouldRerenderOnTransaction: false,
onCreate: ({ editor: ed }) => {
// For large docs we mount empty (below) and parse in chunks here, so the
// O(n²) marked tokenizer never sees the whole document at once.
if (mountChunked) {
const manager = (
ed.storage as { markdown?: { manager?: MarkdownManagerLike } }
).markdown?.manager;
if (manager) {
ed.commands.setContent(
parseMarkdownChunked(manager, initialContent),
{ emitUpdate: false },
);
} else {
ed.commands.setContent(initialContent, {
emitUpdate: false,
contentType: "markdown",
});
}
}
lastEmittedRef.current = stripBlobUrls(ed.getMarkdown()).trimEnd();
},
content: defaultValue ? preprocessMarkdown(defaultValue) : "",
contentType: defaultValue ? "markdown" : undefined,
content: mountChunked ? "" : initialContent,
contentType: mountChunked
? undefined
: defaultValue
? "markdown"
: undefined,
extensions: createEditorExtensions({
placeholder: placeholderText,
queryClient,
@@ -276,10 +308,22 @@ const ContentEditor = forwardRef<ContentEditorRef, ContentEditorProps>(
// `emitUpdate: true`; without this we would re-trigger onUpdate →
// server save → self-write loop.
const { from, to } = editor.state.selection;
editor.commands.setContent(incoming, {
emitUpdate: false,
contentType: "markdown",
});
// Same chunked path on WS-driven re-parse of a large description.
const manager =
incoming.length > MARKDOWN_CHUNK_THRESHOLD
? (editor.storage as { markdown?: { manager?: MarkdownManagerLike } })
.markdown?.manager
: undefined;
if (manager) {
editor.commands.setContent(parseMarkdownChunked(manager, incoming), {
emitUpdate: false,
});
} else {
editor.commands.setContent(incoming, {
emitUpdate: false,
contentType: "markdown",
});
}
// Clamp prior selection to the new doc size so the caret doesn't snap
// to position 0 after ProseMirror replaces the document.

View File

@@ -0,0 +1,62 @@
import type { JSONContent } from "@tiptap/core";
/**
 * Above this source size, ContentEditor parses markdown in chunks instead of in
 * one shot. `@tiptap/markdown` parses via `marked`, whose tokenizer is O(n²) in
 * document length (measured: 533KB plain text → 61.8s parse, while the following
 * ProseMirror setContent is only 40ms). Whole-document parse is the bottleneck;
 * below this threshold the single-parse path is fast enough and stays in use.
 *
 * The value is compared against the markdown string's `.length`, so it counts
 * UTF-16 code units rather than bytes; "50KB" is only exact for ASCII text.
 * The comparison is strict (`length > MARKDOWN_CHUNK_THRESHOLD`), so a document
 * of exactly this length still takes the single-parse path.
 */
export const MARKDOWN_CHUNK_THRESHOLD = 50_000;
/**
 * Minimal structural view of the markdown manager that chunked parsing needs.
 * ContentEditor reads it from `editor.storage.markdown.manager`; only `parse`
 * is required, so any object with a compatible method (e.g. a test double)
 * can be passed.
 */
export interface MarkdownManagerLike {
/** Parses a markdown string into a ProseMirror-style JSON document node. */
parse(markdown: string): JSONContent;
}
/**
 * Parse markdown into a ProseMirror JSON doc in chunks to dodge marked's O(n²).
 *
 * Splitting into k chunks and parsing each independently drops the cost to
 * O(n²/k) — marked only ever scans within one small chunk. Cuts happen only at
 * blank lines OUTSIDE fenced code blocks, so every chunk is a complete sequence
 * of block nodes; the per-chunk docs' top-level nodes are concatenated in order
 * into a single `doc` node.
 *
 * Fence tracking follows the CommonMark rules that matter for cutting:
 * - a fence opens on a run of 3+ backticks or 3+ tildes (leading whitespace is
 *   tolerated so fences nested in list items are still recognised);
 * - a backtick run followed by text containing another backtick is an inline
 *   code span, not a fence opener;
 * - a fence closes only on a run of the SAME character, at least as long as
 *   the opener, with nothing but whitespace after it.
 * An unclosed fence keeps the rest of the document in one chunk.
 *
 * Known limitation: a "loose" list (items separated by blank lines) straddling a
 * chunk boundary may render as two adjacent lists. Acceptable trade-off vs. a
 * minute-long freeze, and only reachable on documents past the threshold.
 *
 * NOTE(review): constructs that are resolved across the whole document (e.g.
 * reference-style link definitions used from another chunk) presumably will
 * not resolve across chunk boundaries — confirm against the parser.
 *
 * @param manager - Object whose `parse` turns one markdown chunk into a doc.
 * @param markdown - Full markdown source to parse.
 * @param chunkSize - Minimum accumulated chunk length (in string length units,
 *   counting one per line break) before a cut is allowed at the next eligible
 *   blank line. Chunks may be larger when no eligible blank line is available.
 * @returns A `doc` node whose `content` is the concatenation of every chunk's
 *   top-level nodes, in source order.
 */
export function parseMarkdownChunked(
  manager: MarkdownManagerLike,
  markdown: string,
  chunkSize = 16_000,
): JSONContent {
  // Leading run of fence characters; whatever follows is inspected separately
  // so CRLF input (a trailing "\r") is handled by plain string checks.
  const fenceRun = /^\s*(`{3,}|~{3,})/;

  const chunks: string[] = [];
  let current: string[] = [];
  let currentLen = 0;
  // Character and length of the currently open fence, or null when outside one.
  let openFence: { marker: string; length: number } | null = null;

  for (const line of markdown.split("\n")) {
    const match = fenceRun.exec(line);
    if (match) {
      const run = match[1] ?? "";
      const marker = run.charAt(0);
      const rest = line.slice(match[0].length);
      if (openFence === null) {
        // A backtick fence's info string cannot contain backticks; such a line
        // is an inline code span ("```x``` text") and must not open a fence.
        if (marker !== "`" || !rest.includes("`")) {
          openFence = { marker, length: run.length };
        }
      } else if (
        marker === openFence.marker &&
        run.length >= openFence.length &&
        rest.trim() === ""
      ) {
        // Only a bare fence of the same kind, at least as long as the opener,
        // closes the block; shorter or different fences are code content.
        openFence = null;
      }
    }

    current.push(line);
    currentLen += line.length + 1;

    // Cut only at a paragraph boundary (blank line) outside a fence, once the
    // accumulated chunk is large enough.
    if (currentLen >= chunkSize && openFence === null && line.trim() === "") {
      chunks.push(current.join("\n"));
      current = [];
      currentLen = 0;
    }
  }
  if (current.length > 0) chunks.push(current.join("\n"));

  // Concatenate the top-level nodes of every chunk's doc, preserving order.
  const content: JSONContent[] = [];
  for (const chunk of chunks) {
    const doc = manager.parse(chunk);
    if (doc.content) {
      for (const node of doc.content) content.push(node);
    }
  }
  return { type: "doc", content };
}