From 463a6caad82cd3de259b1135927722c71e9d80de Mon Sep 17 00:00:00 2001
From: Devon Rifkin
Date: Tue, 19 Aug 2025 22:05:48 -0700
Subject: [PATCH] model: add bpe roundtripping tests

---
 model/bytepairencoding_test.go | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/model/bytepairencoding_test.go b/model/bytepairencoding_test.go
index 7e310b56e5..71947be993 100644
--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
@@ -207,6 +207,41 @@ func TestLlama(t *testing.T) {
 			}
 		}
 	})
+
+	// Round-trips every code point U+0000..U+00FF through Encode and Decode;
+	// U+0000 is the one input this test expects to decode to the empty string.
+	t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
+		t.Parallel()
+
+		for b := 0x00; b <= 0xFF; b++ {
+			// string(rune(b)) is the UTF-8 encoding of code point b, so
+			// values >= 0x80 are two-byte strings, not single raw bytes.
+			input := string(rune(b))
+			ids, err := tokenizer.Encode(input, false)
+			if err != nil {
+				t.Errorf("failed to encode rune 0x%02X: %v", b, err)
+				continue
+			}
+
+			decoded, err := tokenizer.Decode(ids)
+			if err != nil {
+				t.Errorf("failed to decode rune 0x%02X: %v", b, err)
+				continue
+			}
+
+			if b == 0x00 {
+				if len(decoded) != 0 {
+					// Report the decoded text that failed the check; ids are kept for context.
+					t.Errorf("Decode(Encode(0x00)) should be empty, got %q (ids %v)", decoded, ids)
+				}
+				continue
+			}
+
+			if decoded != input {
+				t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
+			}
+		}
+	})
 }
 
 func BenchmarkBytePairEncoding(b *testing.B) {