Compare commits

...

1 Commits

Author SHA1 Message Date
Jiang Bohan
894fe05c85 fix(sanitize): protect markdown code blocks from bluemonday HTML encoding
bluemonday.Sanitize() was applied to raw markdown, which corrupted code
blocks by encoding > to &gt;, < to &lt;, and stripping tag-like syntax
(e.g. Array<string> became Array). Now fenced and inline code blocks are
extracted before sanitization and restored after, preserving code content
while still stripping XSS from non-code regions.

Closes #704
2026-04-11 21:22:46 +08:00
2 changed files with 96 additions and 7 deletions

View File

@@ -1,7 +1,9 @@
package sanitize
import (
	"fmt"
	"html"
	"regexp"
	"strings"

	"github.com/microcosm-cc/bluemonday"
)
@@ -11,11 +13,6 @@ var httpURL = regexp.MustCompile(`^https?://`)
// policy is a shared bluemonday policy that allows safe Markdown HTML while
// stripping dangerous elements (script, iframe, object, embed, style, on*).
//
// Note: bluemonday operates on raw text and has no notion of Markdown, so
// HTML-like syntax inside code blocks (e.g. ```<script>```) is not exempt when
// text is handed to the policy directly. HTML is documented to protect fenced
// and inline code from this. Either way this pass is defense-in-depth — the
// primary sanitization happens in the frontend via rehype-sanitize, which
// understands the Markdown AST.
var policy *bluemonday.Policy
func init() {
@@ -28,8 +25,44 @@ func init() {
policy.AllowAttrs("class").OnElements("code", "div", "span", "pre")
}
// Patterns that locate Markdown code regions. Each one is compiled once at
// package scope.
var (
	// fencedCodeBacktick finds a ```-fenced block: an opening fence at the
	// start of a line (optionally followed by a language tag), a lazily
	// matched body, and a closing fence on a line of its own that is followed
	// only by whitespace.
	fencedCodeBacktick = regexp.MustCompile("(?ms)^```[^\n]*\n.*?^```\\s*$")

	// fencedCodeTilde is the ~~~-fenced counterpart of fencedCodeBacktick.
	fencedCodeTilde = regexp.MustCompile("(?ms)^~~~[^\n]*\n.*?^~~~\\s*$")

	// inlineCodeDouble finds ``code`` spans. The content may not contain a
	// backtick but may span several lines.
	inlineCodeDouble = regexp.MustCompile("``[^`]+``")

	// inlineCodeSingle finds `code` spans whose content contains neither a
	// backtick nor a newline.
	inlineCodeSingle = regexp.MustCompile("`[^`\n]+`")
)
// HTML sanitizes user-provided HTML/Markdown content, stripping dangerous
// tags (script, iframe, object, embed, etc.) and event-handler attributes.
//
// Code blocks (fenced and inline) are protected from bluemonday to prevent
// it from encoding HTML entities or stripping tag-like syntax in code.
func HTML(input string) string {
return policy.Sanitize(input)
var placeholders []string
replace := func(match string) string {
idx := len(placeholders)
placeholders = append(placeholders, match)
return fmt.Sprintf("\x00CODE_%d\x00", idx)
}
// Protect fenced code blocks first (higher priority), then inline code.
s := fencedCodeBacktick.ReplaceAllStringFunc(input, replace)
s = fencedCodeTilde.ReplaceAllStringFunc(s, replace)
s = inlineCodeDouble.ReplaceAllStringFunc(s, replace)
s = inlineCodeSingle.ReplaceAllStringFunc(s, replace)
s = policy.Sanitize(s)
// Restore code blocks.
for i, original := range placeholders {
s = strings.Replace(s, fmt.Sprintf("\x00CODE_%d\x00", i), original, 1)
}
return s
}

View File

@@ -80,13 +80,69 @@ func TestHTML(t *testing.T) {
input: `<div data-type="fileCard" data-href="http://example.com/file.pdf" data-filename="file.pdf"></div>`,
want: `<div data-type="fileCard" data-href="http://example.com/file.pdf" data-filename="file.pdf"></div>`,
},
// Code block protection tests (issue #704)
{
name: "fenced code block preserves angle brackets",
input: "```go\nfunc foo() <-chan int {\n\treturn make(chan int)\n}\n```",
want: "```go\nfunc foo() <-chan int {\n\treturn make(chan int)\n}\n```",
},
{
name: "fenced code block preserves generics",
input: "```typescript\nconst x: Array<string> = []\n```",
want: "```typescript\nconst x: Array<string> = []\n```",
},
{
name: "fenced code block preserves gt operator",
input: "```python\nif x > 0:\n print(x)\n```",
want: "```python\nif x > 0:\n print(x)\n```",
},
{
name: "fenced code block preserves HTML tags in code",
input: "```html\n<script>alert(1)</script>\n<div>hello</div>\n```",
want: "```html\n<script>alert(1)</script>\n<div>hello</div>\n```",
},
{
name: "inline code preserves angle brackets",
input: "Use `Array<string>` for typed arrays",
want: "Use `Array<string>` for typed arrays",
},
{
name: "inline code preserves gt operator",
input: "Check `x > 0` before proceeding",
want: "Check `x > 0` before proceeding",
},
{
name: "inline code preserves ampersand",
input: "Use `a & b` for bitwise AND",
want: "Use `a & b` for bitwise AND",
},
{
name: "double backtick inline code preserved",
input: "Use ``Map<string, List<int>>`` for nested generics",
want: "Use ``Map<string, List<int>>`` for nested generics",
},
{
name: "mixed code and XSS - code protected, XSS stripped",
input: "Use `x > 0` and <script>alert(1)</script> done",
want: "Use `x > 0` and done",
},
{
name: "tilde fenced code block preserved",
input: "~~~rust\nfn main() -> Result<(), Error> {}\n~~~",
want: "~~~rust\nfn main() -> Result<(), Error> {}\n~~~",
},
{
name: "multiple code blocks preserved",
input: "```go\na > b\n```\n\nSome text\n\n```ts\nx < y\n```",
want: "```go\na > b\n```\n\nSome text\n\n```ts\nx < y\n```",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := HTML(tt.input)
if got != tt.want {
t.Errorf("HTML(%q) = %q, want %q", tt.input, got, tt.want)
t.Errorf("HTML() =\n %q\nwant\n %q", got, tt.want)
}
})
}