mirror of
https://github.com/multica-ai/multica.git
synced 2026-06-17 03:38:32 +02:00
fix(skills): harden import against invalid UTF-8 and binary files
PG rejects two byte patterns in a TEXT column. Both crashed real skill imports we hit while assembling the template catalog: - Embedded NUL (0x00) -> SQLSTATE 22021. Already stripped by sanitizeNullBytes, kept as-is. - Other invalid UTF-8 (e.g. 0x91 — Windows-1252 smart quote in a skill whose author saved prose from Word). sanitizeNullBytes now also runs strings.ToValidUTF8 over the content so the second class no longer takes the whole import down. For non-text payloads (images, fonts, archives, compiled binaries), sanitization isn't the right fix — agents never read those as text, and the bytes can't survive a TEXT column at all. addFile now skips them by extension before the per-bundle cap counters tick, logging the skip so an unexpected drop leaves a breadcrumb. Function name kept for compatibility with the many call sites; both behaviours are strict supersets of the original. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,11 +19,18 @@ import (
|
||||
"github.com/multica-ai/multica/server/pkg/protocol"
|
||||
)
|
||||
|
||||
// sanitizeNullBytes makes a string safe for a PostgreSQL TEXT column.
//
// Two failure modes are covered:
//   - Embedded NUL (0x00), which PostgreSQL rejects with
//     "invalid byte sequence for encoding UTF8: 0x00 (SQLSTATE 22021)".
//     Every NUL byte is removed.
//   - Other invalid UTF-8 byte sequences (e.g. 0x91, the Windows-1252 smart
//     quote, which crashed agent-template import of skills containing
//     Windows-encoded prose). strings.ToValidUTF8 drops them.
//
// Valid UTF-8 text without NUL bytes is returned unchanged. The name is kept
// for compatibility with the many call sites; the behaviour is a strict
// superset of the original NUL-only stripping.
func sanitizeNullBytes(s string) string {
	// NUL is itself valid UTF-8, so ToValidUTF8 alone would keep it; it has
	// to be stripped explicitly before the invalid sequences are dropped.
	withoutNUL := strings.ReplaceAll(s, "\x00", "")
	return strings.ToValidUTF8(withoutNUL, "")
}
|
||||
|
||||
// --- Response structs ---
|
||||
@@ -481,7 +488,16 @@ func isCapError(err error) bool {
|
||||
// addFile appends a supporting file while enforcing the per-bundle caps. It
|
||||
// returns an error when either the file count or aggregate byte budget would
|
||||
// be exceeded so the caller fails the import instead of silently truncating.
|
||||
//
|
||||
// Binary files (images, fonts, archives) are skipped without error: their bytes
|
||||
// can't survive a PG TEXT column (SQLSTATE 22021), and they're reference
|
||||
// assets the agent never reads as text anyway. Logging the skip leaves a
|
||||
// breadcrumb if a user expected one of these to import.
|
||||
func (s *importedSkill) addFile(path, content string) error {
|
||||
if isLikelyBinaryFilePath(path) {
|
||||
slog.Info("skill import: skipping binary file", "path", path, "size", len(content))
|
||||
return nil
|
||||
}
|
||||
if len(s.files) >= maxImportFileCount {
|
||||
return fmt.Errorf("%w: import bundle exceeds %d file limit", errImportCapExceeded, maxImportFileCount)
|
||||
}
|
||||
@@ -493,6 +509,34 @@ func (s *importedSkill) addFile(path, content string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// isLikelyBinaryFilePath tells the caller whether path carries an extension
// that marks a non-text payload. The extension is compared case-insensitively
// against a conservative deny-list; anything not listed (including paths with
// no extension) is treated as text and reported as false. Text files that
// still contain stray invalid-UTF-8 bytes are left to `sanitizeNullBytes`
// (called at PG insert time) as the second line of defence.
func isLikelyBinaryFilePath(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".ico", ".heic":
		// images
		return true
	case ".ttf", ".otf", ".woff", ".woff2", ".eot":
		// fonts
		return true
	case ".zip", ".gz", ".tar", ".bz2", ".7z", ".rar":
		// archives
		return true
	case ".pdf", ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt":
		// documents (binary office)
		return true
	case ".mp3", ".mp4", ".wav", ".avi", ".mov", ".webm", ".m4a", ".flac":
		// media
		return true
	case ".exe", ".dll", ".so", ".dylib", ".class", ".jar", ".wasm":
		// compiled / executable
		return true
	case ".db", ".sqlite", ".sqlite3", ".pyc":
		// db / cache
		return true
	default:
		return false
	}
}
|
||||
|
||||
// --- ClawHub types ---
|
||||
|
||||
type clawhubGetSkillResponse struct {
|
||||
|
||||
Reference in New Issue
Block a user