fix(skills): harden import against invalid UTF-8 and binary files

PG rejects two byte patterns in a TEXT column. Both crashed real skill
imports we hit while assembling the template catalog:

- Embedded NUL (0x00) -> SQLSTATE 22021. Already stripped by
  sanitizeNullBytes, kept as-is.
- Other invalid UTF-8 (e.g. 0x91 — Windows-1252 smart quote in a skill
  whose author saved prose from Word). sanitizeNullBytes now also runs
  strings.ToValidUTF8 over the content so the second class no longer
  takes the whole import down.

For non-text payloads (images, fonts, archives, compiled binaries),
sanitization isn't the right fix — agents never read those as text,
and the bytes can't survive a TEXT column at all. addFile now skips
them by extension before the per-bundle cap counters tick, logging
the skip so an unexpected drop leaves a breadcrumb.

Function name kept for compatibility with the many call sites; both
behaviours are strict supersets of the original.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Naiyuan Qing
2026-05-13 16:40:14 +08:00
parent f435dd2cd4
commit 6fafd86ecc

View File

@@ -19,11 +19,18 @@ import (
"github.com/multica-ai/multica/server/pkg/protocol"
)
// sanitizeNullBytes makes a string safe for a PostgreSQL TEXT column.
//
// Two failure modes are covered:
//   - Embedded NUL (0x00): PostgreSQL rejects it with
//     "invalid byte sequence for encoding UTF8: 0x00 (SQLSTATE 22021)".
//     Every NUL byte is removed.
//   - Other invalid UTF-8 byte sequences (e.g. 0x91, a Windows-1252 smart
//     quote, which crashed agent-template import of skills containing
//     Windows-encoded prose). strings.ToValidUTF8 drops them.
//
// NUL bytes are stripped first and the result is then passed through
// strings.ToValidUTF8 with an empty replacement, so the returned string is
// valid UTF-8 containing no NUL bytes. Input that is already clean is
// returned unchanged.
//
// The name is kept for compatibility with the many call sites; the
// behaviour is a strict superset of the original NUL-only stripping.
func sanitizeNullBytes(s string) string {
	withoutNUL := strings.ReplaceAll(s, "\x00", "")
	return strings.ToValidUTF8(withoutNUL, "")
}
// --- Response structs ---
@@ -481,7 +488,16 @@ func isCapError(err error) bool {
// addFile appends a supporting file while enforcing the per-bundle caps. It
// returns an error when either the file count or aggregate byte budget would
// be exceeded so the caller fails the import instead of silently truncating.
//
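// Files with a known binary extension (images, fonts, archives, office
// documents, media, executables, databases) are skipped rather than
// imported: their bytes can't survive a PG TEXT column (SQLSTATE 22021),
// and they're reference assets the agent never reads as text anyway. The
// skip is logged instead of returned as an error and happens before the cap
// checks, leaving a breadcrumb if a user expected one of these to import.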
func (s *importedSkill) addFile(path, content string) error {
if isLikelyBinaryFilePath(path) {
slog.Info("skill import: skipping binary file", "path", path, "size", len(content))
return nil
}
if len(s.files) >= maxImportFileCount {
return fmt.Errorf("%w: import bundle exceeds %d file limit", errImportCapExceeded, maxImportFileCount)
}
@@ -493,6 +509,34 @@ func (s *importedSkill) addFile(path, content string) error {
return nil
}
// isLikelyBinaryFilePath reports whether path ends in an extension that
// marks a non-text payload. The match is case-insensitive and looks only at
// the final extension (so "x.tar.gz" is judged by ".gz").
//
// The list is a deliberately conservative denylist: any extension that is
// not listed, including no extension at all, is treated as text and passes
// through. `sanitizeNullBytes` (called at PG insert time) remains the
// second-line defence for text files that carry stray invalid-UTF-8 bytes.
func isLikelyBinaryFilePath(path string) bool {
	// Each category below has an empty case body: a match simply leaves the
	// switch and reaches the final `return true`. Only the default arm
	// (extension not listed) reports false.
	switch strings.ToLower(filepath.Ext(path)) {
	case ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".ico", ".heic":
		// images
	case ".ttf", ".otf", ".woff", ".woff2", ".eot":
		// fonts
	case ".zip", ".gz", ".tar", ".bz2", ".7z", ".rar":
		// archives
	case ".pdf", ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt":
		// documents (binary office)
	case ".mp3", ".mp4", ".wav", ".avi", ".mov", ".webm", ".m4a", ".flac":
		// media
	case ".exe", ".dll", ".so", ".dylib", ".class", ".jar", ".wasm":
		// compiled / executable
	case ".db", ".sqlite", ".sqlite3", ".pyc":
		// db / cache
	default:
		return false
	}
	return true
}
// --- ClawHub types ---
type clawhubGetSkillResponse struct {