mirror of
https://github.com/multica-ai/multica.git
synced 2026-06-17 03:38:32 +02:00
* i18n: add japanese locale * fix: spacing issues * refactor * fix(desktop): set <html lang> before paint to avoid JA Kanji font flash Switch the documentElement.lang sync from useEffect to useLayoutEffect so lang is committed before the first paint. Otherwise Japanese desktop users saw one frame of Kanji rendered with the Chinese-first fallback stack before the html[lang|="ja"] CJK override applied. Also fix the stale selector in the HTML_LANG comment (html[lang^="ja"] -> html[lang|="ja"]). Addresses review nits on MUL-2893. Co-authored-by: multica-agent <github@multica.ai> * fix(docs): tokenize the ideographic iteration mark in JA search Add U+3005 (々) to the Japanese search tokenizer character class. It sits just below the kana blocks, so words like 様々 / 日々 / 個々 previously dropped the mark and split awkwardly, hurting recall. Addresses a review nit on MUL-2893. Co-authored-by: multica-agent <github@multica.ai> * fix(i18n): restore ja locale parity after merging main Merging main brought new EN strings into agents/chat/onboarding/settings/ squads that the ja bundle (authored against an older snapshot) lacked, breaking the locales parity test. Add the Japanese translations for the new keys (workspace logo upload, agents runtime filter, chat session-history stop dialog, onboarding social_github, squad archived status) and drop the two renamed chat window keys (active_group / archived_group) that EN removed in favour of history_group. Fixes the failing @multica/views parity.test.ts on the FE CI for MUL-2893. Co-authored-by: multica-agent <github@multica.ai> --------- Co-authored-by: J <j@multica.ai> Co-authored-by: multica-agent <github@multica.ai>
67 lines
2.2 KiB
TypeScript
67 lines
2.2 KiB
TypeScript
import { source } from "@/lib/source";
|
|
import { createFromSource } from "fumadocs-core/search/server";
|
|
|
|
// Orama ships no Chinese tokenizer, and its built-in English regex removes
// Han characters altogether, so `locale=zh` would either come back empty or
// throw. This tokenizer works at character level instead:
//   - every Han character in the two ranges of the pattern below becomes
//     its own token;
//   - every run of ASCII letters/digits stays together as one lowercased
//     token, so Romanized terms (product names, CLI commands) still match.
// Anything else (whitespace, punctuation, other scripts) is discarded.
function tokenizeCJK(raw: string): string[] {
  const pattern = /[一-鿿㐀-䶿]|[A-Za-z0-9]+/g;
  // With the `g` flag, `match` returns every matched substring in order, or
  // null when nothing matches; normalise the null case to an empty list.
  const found = raw.toLowerCase().match(pattern);
  return found === null ? [] : [...found];
}
|
|
|
|
// Japanese mixes Hiragana, Katakana and Kanji; the English regex strips them
// all, and the zh tokenizer only keeps Han (Kanji), dropping kana entirely.
// Tokenize each kana/Kanji codepoint on its own and keep Latin/digit runs
// whole — same character-level recall strategy as tokenizeCJK, extended to
// the Hiragana (\u3040-\u309f) and Katakana (\u30a0-\u30ff) blocks, plus the
// ideographic iteration mark \u3005 which sits just below the kana blocks and
// recurs in common words (e.g. the JP for "various", "daily", "individual").
//
// The input is NFKC-normalized before matching so that equivalent spellings
// produce the same tokens:
//   - full-width Latin letters and digits fold to ASCII and are kept as
//     runs instead of being dropped;
//   - half-width Katakana folds to the regular Katakana block;
//   - a kana followed by a combining (han)dakuten mark is composed into the
//     single precomposed codepoint instead of splitting into two tokens.
//
// Returns the tokens in input order; returns an empty array when nothing in
// `raw` matches (including the empty string).
function tokenizeJapanese(raw: string): string[] {
  const tokens: string[] = [];
  const regex = /[\u3005\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff]|[A-Za-z0-9]+/g;
  // Normalize first, then lowercase: NFKC can turn full-width capitals into
  // ASCII capitals, which still need case folding.
  const normalized = raw.normalize("NFKC").toLowerCase();
  let match: RegExpExecArray | null;
  while ((match = regex.exec(normalized)) !== null) {
    tokens.push(match[0]);
  }
  return tokens;
}
|
|
|
|
// Search route handler built from the docs source. `localeMap` supplies the
// per-locale Orama options; locales not listed here get no override from this
// file.
export const { GET } = createFromSource(source, {
  localeMap: {
    // Korean: only the tokenizer language is set; no custom `tokenize`.
    // NOTE(review): the English tokenizer regex is documented above as
    // stripping Han characters; it presumably strips Hangul too — confirm
    // whether `ko` needs its own character-level tokenizer.
    ko: {
      components: {
        tokenizer: { language: "english" },
      },
    },
    // Japanese: character-level tokenizer covering kana and Kanji.
    ja: {
      components: {
        tokenizer: {
          tokenize: tokenizeJapanese,
          normalizationCache: new Map(),
          language: "english",
        },
      },
    },
    // Chinese: character-level tokenizer covering Han characters.
    zh: {
      components: {
        tokenizer: {
          tokenize: tokenizeCJK,
          normalizationCache: new Map(),
          language: "english",
        },
      },
    },
  },
});
|