mirror of
https://github.com/multica-ai/multica.git
synced 2026-07-05 13:29:44 +02:00
fix(skills): harden import against invalid UTF-8 and binary files
PG rejects two byte patterns in a TEXT column. Both crashed real skill imports we hit while assembling the template catalog: - Embedded NUL (0x00) -> SQLSTATE 22021. Already stripped by sanitizeNullBytes, kept as-is. - Other invalid UTF-8 (e.g. 0x91 — Windows-1252 smart quote in a skill whose author saved prose from Word). sanitizeNullBytes now also runs strings.ToValidUTF8 over the content so the second class no longer takes the whole import down. For non-text payloads (images, fonts, archives, compiled binaries), sanitization isn't the right fix — agents never read those as text, and the bytes can't survive a TEXT column at all. addFile now skips them by extension before the per-bundle cap counters tick, logging the skip so an unexpected drop leaves a breadcrumb. Function name kept for compatibility with the many call sites; both behaviours are strict supersets of the original. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,11 +19,18 @@ import (
|
|||||||
"github.com/multica-ai/multica/server/pkg/protocol"
|
"github.com/multica-ai/multica/server/pkg/protocol"
|
||||||
)
|
)
|
||||||
|
|
||||||
// sanitizeNullBytes removes null bytes (0x00) from strings.
|
// sanitizeNullBytes makes a string safe for a PostgreSQL TEXT column.
|
||||||
// PostgreSQL rejects null bytes in text columns with
|
//
|
||||||
// "invalid byte sequence for encoding UTF8: 0x00 (SQLSTATE 22021)".
|
// Two failure modes covered:
|
||||||
|
// - Embedded NUL (0x00) — PG rejects with SQLSTATE 22021. Removed.
|
||||||
|
// - Other invalid-UTF-8 byte sequences (e.g. 0x91 = Windows-1252 smart
|
||||||
|
// quote, which crashed agent-template import of skills containing
|
||||||
|
// Windows-encoded prose). `strings.ToValidUTF8` drops them.
|
||||||
|
//
|
||||||
|
// Name is kept for compatibility with the many call sites; the behaviour
|
||||||
|
// is a strict superset of the original.
|
||||||
func sanitizeNullBytes(s string) string {
|
func sanitizeNullBytes(s string) string {
|
||||||
return strings.ReplaceAll(s, "\x00", "")
|
return strings.ToValidUTF8(strings.ReplaceAll(s, "\x00", ""), "")
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Response structs ---
|
// --- Response structs ---
|
||||||
@@ -481,7 +488,16 @@ func isCapError(err error) bool {
|
|||||||
// addFile appends a supporting file while enforcing the per-bundle caps. It
|
// addFile appends a supporting file while enforcing the per-bundle caps. It
|
||||||
// returns an error when either the file count or aggregate byte budget would
|
// returns an error when either the file count or aggregate byte budget would
|
||||||
// be exceeded so the caller fails the import instead of silently truncating.
|
// be exceeded so the caller fails the import instead of silently truncating.
|
||||||
|
//
|
||||||
|
// Binary files (images, fonts, archives) are silently skipped: their bytes
|
||||||
|
// can't survive a PG TEXT column (SQLSTATE 22021), and they're reference
|
||||||
|
// assets the agent never reads as text anyway. Logging the skip leaves a
|
||||||
|
// breadcrumb if a user expected one of these to import.
|
||||||
func (s *importedSkill) addFile(path, content string) error {
|
func (s *importedSkill) addFile(path, content string) error {
|
||||||
|
if isLikelyBinaryFilePath(path) {
|
||||||
|
slog.Info("skill import: skipping binary file", "path", path, "size", len(content))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if len(s.files) >= maxImportFileCount {
|
if len(s.files) >= maxImportFileCount {
|
||||||
return fmt.Errorf("%w: import bundle exceeds %d file limit", errImportCapExceeded, maxImportFileCount)
|
return fmt.Errorf("%w: import bundle exceeds %d file limit", errImportCapExceeded, maxImportFileCount)
|
||||||
}
|
}
|
||||||
@@ -493,6 +509,34 @@ func (s *importedSkill) addFile(path, content string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isLikelyBinaryFilePath reports whether the file's extension indicates a
|
||||||
|
// non-text payload. Conservative blacklist — extensions not on the list
|
||||||
|
// are assumed text and pass through. `sanitizeNullBytes` (called at PG
|
||||||
|
// insert time) is the second-line defence against any text file that
|
||||||
|
// turns out to have stray invalid-UTF-8 bytes.
|
||||||
|
func isLikelyBinaryFilePath(path string) bool {
|
||||||
|
ext := strings.ToLower(filepath.Ext(path))
|
||||||
|
switch ext {
|
||||||
|
case
|
||||||
|
// images
|
||||||
|
".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".ico", ".heic",
|
||||||
|
// fonts
|
||||||
|
".ttf", ".otf", ".woff", ".woff2", ".eot",
|
||||||
|
// archives
|
||||||
|
".zip", ".gz", ".tar", ".bz2", ".7z", ".rar",
|
||||||
|
// documents (binary office)
|
||||||
|
".pdf", ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt",
|
||||||
|
// media
|
||||||
|
".mp3", ".mp4", ".wav", ".avi", ".mov", ".webm", ".m4a", ".flac",
|
||||||
|
// compiled / executable
|
||||||
|
".exe", ".dll", ".so", ".dylib", ".class", ".jar", ".wasm",
|
||||||
|
// db / cache
|
||||||
|
".db", ".sqlite", ".sqlite3", ".pyc":
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
// --- ClawHub types ---
|
// --- ClawHub types ---
|
||||||
|
|
||||||
type clawhubGetSkillResponse struct {
|
type clawhubGetSkillResponse struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user