From 05982a95cb9e053fadf309e60ec9ff2bc58ba32e Mon Sep 17 00:00:00 2001 From: Grace <88872231+gr4ceG@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:52:33 -0700 Subject: [PATCH] Qwen3VL Cloud Parser and Renderer (#12526) * working (other than tool call is the incorrect order) for tool calls and tools * Tests work, other than image tags (tests do not go through server) and tools (not in the correct order, but contents are the same) * testing for qwen3vl parser - toolparser is working * made changes to JSON tool parser, wraps the ToolCallFunction with a ToolCall object * Working parser for thinking models - assumes state of thinking, emits unambiguous content in thinking, does not call tool call in thinking * changed the parser to start with collecting content * thinking prefill * add hasThinkingSupport parameter to parser * qwen3-vl -> qwen3-vl-instruct for renderer/parser * Add hasThinkingSupport=false to Qwen3VLParser --------- Co-authored-by: Devon Rifkin --- api/types.go | 6 +- model/parsers/parsers.go | 3 + model/parsers/qwen3coder.go | 4 +- model/parsers/qwen3coder_test.go | 15 + model/parsers/qwen3vl.go | 228 +++++++ model/parsers/qwen3vl_nonthinking_test.go | 655 ++++++++++++++++++++ model/parsers/qwen3vl_thinking_test.go | 346 +++++++++++ model/renderers/qwen3coder.go | 4 +- model/renderers/qwen3coder_test.go | 2 +- model/renderers/qwen3vl.go | 166 +++++ model/renderers/qwen3vl_nonthinking_test.go | 497 +++++++++++++++ model/renderers/qwen3vl_test.go | 346 +++++++++++ model/renderers/qwen3vl_thinking_test.go | 372 +++++++++++ model/renderers/renderer.go | 24 +- openai/openai.go | 5 + server/prompt.go | 3 +- 16 files changed, 2654 insertions(+), 22 deletions(-) create mode 100644 model/parsers/qwen3vl.go create mode 100644 model/parsers/qwen3vl_nonthinking_test.go create mode 100644 model/parsers/qwen3vl_thinking_test.go create mode 100644 model/renderers/qwen3vl.go create mode 100644 model/renderers/qwen3vl_nonthinking_test.go create mode 100644 
model/renderers/qwen3vl_test.go create mode 100644 model/renderers/qwen3vl_thinking_test.go diff --git a/api/types.go b/api/types.go index 41b490b512..85a4887764 100644 --- a/api/types.go +++ b/api/types.go @@ -266,9 +266,9 @@ func (pt PropertyType) String() string { type ToolProperty struct { AnyOf []ToolProperty `json:"anyOf,omitempty"` - Type PropertyType `json:"type"` + Type PropertyType `json:"type,omitempty"` Items any `json:"items,omitempty"` - Description string `json:"description"` + Description string `json:"description,omitempty"` Enum []any `json:"enum,omitempty"` } @@ -332,7 +332,7 @@ func (t *ToolFunctionParameters) String() string { type ToolFunction struct { Name string `json:"name"` - Description string `json:"description"` + Description string `json:"description,omitempty"` Parameters ToolFunctionParameters `json:"parameters"` } diff --git a/model/parsers/parsers.go b/model/parsers/parsers.go index a1d4e81271..040c2562af 100644 --- a/model/parsers/parsers.go +++ b/model/parsers/parsers.go @@ -21,6 +21,9 @@ func ParserForName(name string) Parser { case "qwen3-coder": parser := &Qwen3CoderParser{} return parser + case "qwen3-vl-instruct": + parser := &Qwen3VLParser{hasThinkingSupport: false} + return parser case "passthrough": return &PassthroughParser{} case "harmony": diff --git a/model/parsers/qwen3coder.go b/model/parsers/qwen3coder.go index f44d7c8efd..84636bdc6a 100644 --- a/model/parsers/qwen3coder.go +++ b/model/parsers/qwen3coder.go @@ -150,7 +150,9 @@ func eat(p *Qwen3CoderParser) ([]qwenEvent, bool) { ambiguous := p.acc.String()[ambiguousStart:] p.acc.Reset() p.acc.WriteString(ambiguous) - events = append(events, qwenEventContent{content: unambiguous}) + if len(unambiguous) > 0 { + events = append(events, qwenEventContent{content: unambiguous}) + } return events, false } else { // we found content that is entirely not a tool call. 
We should withhold diff --git a/model/parsers/qwen3coder_test.go b/model/parsers/qwen3coder_test.go index c77fe2d95f..3756b4bac7 100644 --- a/model/parsers/qwen3coder_test.go +++ b/model/parsers/qwen3coder_test.go @@ -103,6 +103,21 @@ func TestQwenParserStreaming(t *testing.T) { }, }, }, + { + desc: "unambiguous empty: partial tool open at buffer start", + steps: []step{ + { + input: "abc", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + }, + }, + }, + }, { desc: "trailing whitespace between tool call and content", steps: []step{ diff --git a/model/parsers/qwen3vl.go b/model/parsers/qwen3vl.go new file mode 100644 index 0000000000..965e324643 --- /dev/null +++ b/model/parsers/qwen3vl.go @@ -0,0 +1,228 @@ +package parsers + +import ( + "context" + "encoding/json" + "log/slog" + "strings" + "unicode" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/logutil" +) + +// TODO: call the init function +const ( + CollectingThinkingContent qwenParserState = iota + CollectingContent + CollectingToolContent +) + +const ( + thinkingCloseTag = "" +) + +// TODO(gguo): add a field for isThinking +type Qwen3VLParser struct { + state qwenParserState + buffer strings.Builder + tools []api.Tool + hasThinkingSupport bool +} + +func (p *Qwen3VLParser) HasToolSupport() bool { + return true +} + +// TODO(gguo): changes this to reference an objects param +func (p *Qwen3VLParser) HasThinkingSupport() bool { + return p.hasThinkingSupport +} + +func (p *Qwen3VLParser) initialState() qwenParserState { + if p.HasThinkingSupport() { // has thinking, start from collecting thinking content + return CollectingThinkingContent + } + return CollectingContent +} + +func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool { + p.tools = tools + p.state = p.initialState() + return tools +} + +type qwenEventThinkingContent struct { + content string +} + +func (qwenEventThinkingContent) isQwenEvent() {} + +func (p *Qwen3VLParser) Add(s string, done 
bool) (content string, thinking string, calls []api.ToolCall, err error) { + p.buffer.WriteString(s) + events := p.parseEvents() + + var toolCalls []api.ToolCall + var sb strings.Builder + for _, event := range events { + switch event := event.(type) { + case qwenEventRawToolCall: + toolCall, err := parseJSONToolCall(event, p.tools) + if err != nil { + slog.Warn("qwen tool call parsing failed", "error", err) + return "", "", nil, err + } + toolCalls = append(toolCalls, toolCall) + case qwenEventThinkingContent: + sb.WriteString(event.content) + case qwenEventContent: + // TODO(drifkin): if the same turn contains multiple interleaved content + // events, we naively append them together here. + sb.WriteString(event.content) + } + } + + return sb.String(), "", toolCalls, nil +} + +func (p *Qwen3VLParser) parseEvents() []qwenEvent { + var all []qwenEvent + + keepLooping := true + for keepLooping { + var events []qwenEvent + events, keepLooping = p.eat() + if len(events) > 0 { + all = append(all, events...) 
+ } + } + + if len(all) > 0 { + slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String()) + } + + return all +} + +func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent { + split := strings.SplitN(p.buffer.String(), tag, 2) + before := split[0] + before = strings.TrimRightFunc(before, unicode.IsSpace) + if len(before) > 0 { + events = append(events, qwenEventContent{content: before}) + } + after := split[1] + p.buffer.Reset() + p.buffer.WriteString(after) + return events +} + +func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { + var events []qwenEvent + + switch p.state { + case CollectingContent: + if strings.Contains(p.buffer.String(), toolOpenTag) { + events = emitContentBeforeTag(p, events, toolOpenTag) + p.state = CollectingToolContent + return events, true + } else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { + beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] + trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag) + ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen + + unambiguous := p.buffer.String()[:ambiguousStart] + ambiguous := p.buffer.String()[ambiguousStart:] + p.buffer.Reset() + p.buffer.WriteString(ambiguous) + if len(unambiguous) > 0 { + events = append(events, qwenEventContent{content: unambiguous}) + } + return events, false + } else { + whitespaceLen := trailingWhitespaceLen(p.buffer.String()) + ambiguousStart := len(p.buffer.String()) - whitespaceLen + + unambiguous := p.buffer.String()[:ambiguousStart] + ambiguous := p.buffer.String()[ambiguousStart:] + p.buffer.Reset() + p.buffer.WriteString(ambiguous) + if len(unambiguous) > 0 { + events = append(events, qwenEventContent{content: unambiguous}) + } + return events, false + } + case CollectingToolContent: + if strings.Contains(p.buffer.String(), toolCloseTag) { + split := strings.SplitN(p.buffer.String(), 
toolCloseTag, 2) + before := split[0] + if len(before) == 0 { + slog.Warn("qwen tool call closing tag found but no content before it") + } + + after := strings.TrimLeftFunc(split[1], unicode.IsSpace) + events = append(events, qwenEventRawToolCall{raw: before}) + p.buffer.Reset() + p.buffer.WriteString(after) + p.state = CollectingContent + return events, true + } else { + return events, false + } + case CollectingThinkingContent: // so we want to hip the unambiguous stuff + if strings.Contains(p.buffer.String(), thinkingCloseTag) { + split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2) + before := split[0] + if len(before) == 0 { + slog.Warn("qwen tool call closing tag found but no content before it") + } + after := strings.TrimLeftFunc(split[1], unicode.IsSpace) + if len(before) > 0 { + events = append(events, qwenEventThinkingContent{content: before}) + } + p.buffer.Reset() + p.buffer.WriteString(after) + p.state = CollectingContent + return events, true + } else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 { // we see part of a close thinking tag + beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] + trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag) + ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen + + unambiguous := p.buffer.String()[:ambiguousStart] + ambiguous := p.buffer.String()[ambiguousStart:] + p.buffer.Reset() + p.buffer.WriteString(ambiguous) + if len(unambiguous) > 0 { + events = append(events, qwenEventThinkingContent{content: unambiguous}) + } + return events, false + } else { + whitespaceLen := trailingWhitespaceLen(p.buffer.String()) + ambiguousStart := len(p.buffer.String()) - whitespaceLen + + unambiguous := p.buffer.String()[:ambiguousStart] + ambiguous := p.buffer.String()[ambiguousStart:] + p.buffer.Reset() + p.buffer.WriteString(ambiguous) + if len(unambiguous) > 0 { + events = append(events, qwenEventThinkingContent{content: unambiguous}) + } + 
return events, false + } + default: + panic("unreachable") + } +} + +func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) { + var toolCallFunction api.ToolCallFunction + if err := json.Unmarshal([]byte(raw.raw), &toolCallFunction); err != nil { + return api.ToolCall{}, err + } + + toolCall := api.ToolCall{} + toolCall.Function = toolCallFunction + + return toolCall, nil +} diff --git a/model/parsers/qwen3vl_nonthinking_test.go b/model/parsers/qwen3vl_nonthinking_test.go new file mode 100644 index 0000000000..743929465c --- /dev/null +++ b/model/parsers/qwen3vl_nonthinking_test.go @@ -0,0 +1,655 @@ +package parsers + +import ( + "reflect" + "testing" + + "github.com/ollama/ollama/api" +) + +func TestQwen3VLNonThinkingParserStreaming(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "simple thinking", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: "abc"}}}, + }, + }, + { + desc: "simple trip thinking", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: "abc"}}}, + }, + }, + { + desc: "thinking with split tags", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: "abc"}}}, + {input: "", wantEvents: []qwenEvent{qwenEventContent{content: ""}}}, + }, + }, + { + desc: "multiple think tags", + steps: []step{ + {input: "abcactually, is not thinking", wantEvents: []qwenEvent{qwenEventContent{content: "abcactually, is not thinking"}}}, + }, + }, + { + desc: "thinking and tool call", + steps: []step{ + { + input: "I'm thinkingI'm tool calling", + wantEvents: []qwenEvent{ + qwenEventContent{content: "I'm thinking"}, + qwenEventRawToolCall{raw: "I'm tool calling"}, + }, + }, + }, + }, + { + desc: "nested thinking (outside thinking, inside thinking)", + steps: []step{ + { + input: "I'm thinkingI'm nested thinking", + wantEvents: 
[]qwenEvent{ + qwenEventContent{content: "I'm thinkingI'm nested thinking"}, + }, + }, + }, + }, + { + desc: "interleaved thinking", + steps: []step{ + { + input: "I'm thinkingI'm actually content", + wantEvents: []qwenEvent{ + qwenEventContent{content: "I'm thinkingI'm actually content"}, + }, + }, + }, + }, + { + desc: "nested thinking and tool call (outside thinking, inside tool call)", + steps: []step{ + { + input: "I'm thinkingI'm nested tool call", + wantEvents: []qwenEvent{ + qwenEventContent{content: "I'm thinking"}, + qwenEventRawToolCall{raw: "I'm nested tool call"}, + qwenEventContent{content: ""}, + }, + }, + }, + }, + { + desc: "nested thinking and tool call (outside tool call, inside thinking)", + steps: []step{ + { + input: "I'm nested tool callI'm thinking", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "I'm nested tool callI'm thinking"}, + }, + }, + }, + }, + { + desc: "interleaved thinking and tool call", + steps: []step{ + { + input: "I'm thinkingI'm NOT a nested tool callI'm nested tool call 2", + wantEvents: []qwenEvent{ + qwenEventContent{content: "I'm thinking"}, + qwenEventRawToolCall{raw: "I'm NOT a nested tool call"}, + qwenEventRawToolCall{raw: "I'm nested tool call 2"}, + qwenEventContent{content: ""}, + }, + }, + }, + }, + { + desc: "emit unambiguous before partial tool open (trailing ws)", + steps: []step{ + { + input: "abc\u00a0\nabc", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + }, + }, + }, + }, + { + desc: "partial thinking tag fakeout", + steps: []step{ + { + input: "abcunfinished<", // when something is ambiguious, we dont emit anything + wantEvents: []qwenEvent{qwenEventContent{content: "abcunfinished"}}, + }, + }, + }, + { + desc: "test with split tool and content", + steps: []step{ + { + input: "abcunfinished def", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "unfinished"}, + qwenEventContent{content: "def"}, + }, + }, + }, + }, + } + anyOnlies := false + for _, tc := range cases { + if 
tc.only { + anyOnlies = true + } + } + + for _, tc := range cases { + if anyOnlies && !tc.only { + continue + } + + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: false} + parser.Init([]api.Tool{}, nil) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + // avoid deep equal on empty vs. nil slices + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} + +func TestQwenOldParserStreaming(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "simple message streamed word by word", + steps: []step{ + { + input: "hi", + wantEvents: []qwenEvent{qwenEventContent{content: "hi"}}, + }, + { + input: " there", + wantEvents: []qwenEvent{qwenEventContent{content: " there"}}, + }, + }, + }, + { + desc: "content before tool call", + steps: []step{ + { + input: "hi there", + wantEvents: []qwenEvent{qwenEventContent{content: "hi there"}}, + }, + }, + }, + { + desc: "multiple tool calls in one message", + steps: []step{ + { + input: "before1in tool callafter1in tool call 2after2", + wantEvents: []qwenEvent{ + qwenEventContent{content: "before1"}, + qwenEventRawToolCall{raw: "in tool call"}, + qwenEventContent{content: "after1"}, + qwenEventRawToolCall{raw: "in tool call 2"}, + qwenEventContent{content: "after2"}, + }, + }, + }, + }, + { + desc: "tool calls with split tags", + steps: []step{ + { + input: "beforein tool callaf", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "in tool call"}, + qwenEventContent{content: "af"}, + }, + }, + { + input: "ter", + wantEvents: []qwenEvent{ + qwenEventContent{content: "ter"}, + }, + }, + }, + }, + { + desc: "trailing whitespace between 
content and tool call", + steps: []step{ + { + input: "abc\ndef", + wantEvents: []qwenEvent{ + qwenEventContent{content: "abc"}, + qwenEventRawToolCall{raw: "def"}, + }, + }, + }, + }, + { + desc: "trailing whitespace between tool call and content", + steps: []step{ + { + input: "abc\ndef", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + qwenEventContent{content: "def"}, + }, + }, + }, + }, + { + desc: "empty content before tool call", + steps: []step{ + { + input: "\nabc", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + }, + }, + }, + }, + { + desc: "partial tool open tag fakeout", + steps: []step{ + { + input: "abc\ntestู…ุฑุญุจุง", + wantEvents: []qwenEvent{ + qwenEventContent{content: "ไฝ ๅฅฝ ๐ŸŒ"}, + qwenEventRawToolCall{raw: "test"}, + qwenEventContent{content: "ู…ุฑุญุจุง"}, + }, + }, + }, + }, + { + desc: "arabic text handling", + steps: []step{ + { + input: "ู…ุฑุญุจุง ุจุงู„ุนุงู„ู…", + wantEvents: []qwenEvent{qwenEventContent{content: "ู…ุฑุญุจุง ุจุงู„ุนุงู„ู…"}}, + }, + }, + }, + { + desc: "emoji passthrough", + steps: []step{ + { + input: "โœ…", + wantEvents: []qwenEvent{qwenEventContent{content: "โœ…"}}, + }, + }, + }, + { + desc: "emoji after tool call", + steps: []step{ + { + input: "testๅฎŒๆˆ โœ…", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "test"}, + qwenEventContent{content: "ๅฎŒๆˆ โœ…"}, + }, + }, + }, + }, + { + desc: "unicode streaming with whitespace handling", + steps: []step{ + { + input: "ู…ุฑุญุจุง", + wantEvents: []qwenEvent{ + qwenEventContent{content: "ู…ุฑุญุจุง"}, + }, + }, + { + input: " \n", + wantEvents: []qwenEvent{}, + }, + { + input: "ไธ–็•Œ", + wantEvents: []qwenEvent{ + qwenEventContent{content: " \nไธ–็•Œ"}, + }, + }, + }, + }, + { + desc: "non-breaking space withheld across chunks", + steps: []step{ + { + input: "Hello\u00a0", + wantEvents: []qwenEvent{ + qwenEventContent{content: "Hello"}, + }, + }, + { + input: "world", + wantEvents: []qwenEvent{ + 
qwenEventContent{content: "\u00a0world"}, + }, + }, + }, + }, + { + desc: "ideographic space before partial tool", + steps: []step{ + { + input: "Hello\u3000abc", + wantEvents: []qwenEvent{}, + }, + { + input: "def", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + qwenEventContent{content: "def"}, + }, + }, + }, + }, + { + desc: "ideographic space before partial tool fakeout", + steps: []step{ + { + input: "Hello\u3000abc", + wantEvents: []qwenEvent{ + qwenEventContent{content: "\u3000abc"}, + }, + }, + }, + }, + { + desc: "unicode with partial tool tag", + steps: []step{ + { + input: "ๆต‹่ฏ•๐ŸŽฏ b and a < b\""}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "exec", + Arguments: map[string]any{ + "command": "ls && echo \"a > b and a < b\"", + }, + }, + }, + }, + { + name: "unicode in function names and parameters", + tools: []api.Tool{}, + rawToolCall: `{"name": "่Žทๅ–ๅคฉๆฐ”", "arguments": {"ๅŸŽๅธ‚": "ๅŒ—ไบฌ", "message": "Hello! ไฝ ๅฅฝ! ๐ŸŒŸ ู…ุฑุญุจุง"}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "่Žทๅ–ๅคฉๆฐ”", + Arguments: map[string]any{ + "ๅŸŽๅธ‚": "ๅŒ—ไบฌ", + "message": "Hello! ไฝ ๅฅฝ! 
๐ŸŒŸ ู…ุฑุญุจุง", + }, + }, + }, + }, + } + + for i, step := range steps { + gotToolCall, err := parseJSONToolCall(qwenEventRawToolCall{raw: step.rawToolCall}, step.tools) + if err != nil { + t.Errorf("step %d (%s): %v", i, step.name, err) + } + if !reflect.DeepEqual(gotToolCall, step.wantToolCall) { + t.Errorf("step %d (%s): got tool call %#v, want %#v", i, step.name, gotToolCall, step.wantToolCall) + } + } +} diff --git a/model/parsers/qwen3vl_thinking_test.go b/model/parsers/qwen3vl_thinking_test.go new file mode 100644 index 0000000000..a94344a4f4 --- /dev/null +++ b/model/parsers/qwen3vl_thinking_test.go @@ -0,0 +1,346 @@ +package parsers + +import ( + "reflect" + "testing" + + "github.com/ollama/ollama/api" +) + +func TestQwen3VLThinkingParserStreaming(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "simple thinking", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}}, + }, + }, + { + desc: "simple trip thinking", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}}, + }, + }, + { + desc: "thinking with split tags", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}}, + {input: "", wantEvents: []qwenEvent{}}, + }, + }, + { + desc: "multiple think tags", + steps: []step{ + {input: "abcactually, is not thinking", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abcactually, is not thinking"}}}, + }, + }, + { + desc: "thinking and tool call", + steps: []step{ + { + input: "I'm thinkingI'm tool calling", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "I'm thinking"}, + qwenEventRawToolCall{raw: "I'm tool calling"}, + }, + }, + }, + }, + { + desc: "thinking and content", + steps: []step{ + { + input: "I'm thinkingI'm content", + wantEvents: []qwenEvent{ + 
qwenEventThinkingContent{content: "I'm thinking"}, + qwenEventContent{content: "I'm content"}, + }, + }, + }, + }, + { + desc: "thinking and tool call and content", + }, + { + desc: "nested thinking (outside thinking, inside thinking)", + steps: []step{ + { + input: "I'm thinkingI'm nested thinking", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "I'm thinkingI'm nested thinking"}, + qwenEventContent{content: ""}, + }, + }, + }, + }, + { + desc: "interleaved thinking", + steps: []step{ + { + input: "I'm thinkingI'm actually content", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "I'm thinking"}, + qwenEventContent{content: "I'm actually content"}, + }, + }, + }, + }, + { + desc: "nested thinking and tool call (outside thinking, inside tool call)", + steps: []step{ + { + input: "I'm thinkingI'm nested tool call", + wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinkingI'm nested tool call"}}, + }, + }, + }, + { + desc: "nested thinking and tool call (outside tool call, inside thinking)", + steps: []step{ + { + input: "I'm nested tool callI'm thinking", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "I'm nested tool callI'm thinking"}, + qwenEventContent{content: ""}, + }, + }, + }, + }, + { + desc: "interleaved thinking and tool call", + steps: []step{ + { + input: "I'm thinkingI'm NOT a nested tool callI'm nested tool call 2", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "I'm thinkingI'm NOT a nested tool call"}, + qwenEventContent{content: ""}, + qwenEventRawToolCall{raw: "I'm nested tool call 2"}, + qwenEventContent{content: ""}, + }, + }, + }, + }, + { + desc: "partial thinking tag fakeout", + steps: []step{ + { + input: "abcunfinishedunfinished"}}, + }, + }, + }, + { + desc: "test with split thinking and content", + steps: []step{ + { + input: "abcunfinishedunfinished"}}, + }, + { + input: "ink> def", + wantEvents: []qwenEvent{ + qwenEventContent{content: "def"}, + }, + }, + }, 
+ }, + { + desc: "thinking with no tags", + steps: []step{ + { + input: "Hello I am thinking", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "Hello I am thinking"}, + }, + }, + { + input: "Hello I am thinking some more", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "Hello I am thinking some more"}, + }, + }, + { + input: "Hello I am think NOT", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "Hello I am think"}, + qwenEventContent{content: "NOT"}, + }, + }, + }, + }, + } + anyOnlies := false + for _, tc := range cases { + if tc.only { + anyOnlies = true + } + } + + for _, tc := range cases { + if anyOnlies && !tc.only { + continue + } + + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, nil) + // parser.state = CollectingThinkingContent + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + // avoid deep equal on empty vs. 
nil slices + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} + +func TestQwen3VLThinkingToolParser(t *testing.T) { + type step struct { + name string + rawToolCall string + tools []api.Tool + wantToolCall api.ToolCall + } + + steps := []step{ + { + name: "simple tool call", + tools: []api.Tool{}, + rawToolCall: `{"name": "get-current-weather", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit"}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get-current-weather", + Arguments: map[string]any{ + "location": "San Francisco, CA", + "unit": "fahrenheit", + }, + }, + }, + }, + { + name: "names with spaces", + tools: []api.Tool{}, + rawToolCall: `{"name": "get current temperature", "arguments": {"location with spaces": "San Francisco", "unit with spaces": "celsius"}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get current temperature", + Arguments: map[string]any{ + "location with spaces": "San Francisco", + "unit with spaces": "celsius", + }, + }, + }, + }, + { + name: "names with quotes", + tools: []api.Tool{}, + rawToolCall: `{"name": "\"get current temperature\"", "arguments": {"\"location with spaces\"": "San Francisco", "\"unit with spaces\"": "\"celsius\""}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "\"get current temperature\"", + Arguments: map[string]any{ + "\"location with spaces\"": "San Francisco", + "\"unit with spaces\"": "\"celsius\"", + }, + }, + }, + }, + { + name: "tool call with typed parameters (json types)", + tools: []api.Tool{}, + rawToolCall: `{"name": "calculate", "arguments": {"x": 3.14, "y": 42, "enabled": true, "items": ["a", "b", "c"]}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "calculate", + Arguments: map[string]any{ + "x": 3.14, + "y": float64(42), + "enabled": 
true, + "items": []any{"a", "b", "c"}, + }, + }, + }, + }, + { + name: "ampersands in parameter values", + tools: []api.Tool{}, + rawToolCall: `{"name": "exec", "arguments": {"command": "ls && echo \"done\""}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "exec", + Arguments: map[string]any{ + "command": "ls && echo \"done\"", + }, + }, + }, + }, + { + name: "angle brackets in parameter values", + tools: []api.Tool{}, + rawToolCall: `{"name": "exec", "arguments": {"command": "ls && echo \"a > b and a < b\""}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "exec", + Arguments: map[string]any{ + "command": "ls && echo \"a > b and a < b\"", + }, + }, + }, + }, + { + name: "unicode in function names and parameters", + tools: []api.Tool{}, + rawToolCall: `{"name": "่Žทๅ–ๅคฉๆฐ”", "arguments": {"ๅŸŽๅธ‚": "ๅŒ—ไบฌ", "message": "Hello! ไฝ ๅฅฝ! ๐ŸŒŸ ู…ุฑุญุจุง"}}`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "่Žทๅ–ๅคฉๆฐ”", + Arguments: map[string]any{ + "ๅŸŽๅธ‚": "ๅŒ—ไบฌ", + "message": "Hello! ไฝ ๅฅฝ! 
๐ŸŒŸ ู…ุฑุญุจุง", + }, + }, + }, + }, + } + + for i, step := range steps { + gotToolCall, err := parseJSONToolCall(qwenEventRawToolCall{raw: step.rawToolCall}, step.tools) + if err != nil { + t.Errorf("step %d (%s): %v", i, step.name, err) + } + if !reflect.DeepEqual(gotToolCall, step.wantToolCall) { + t.Errorf("step %d (%s): got tool call %#v, want %#v", i, step.name, gotToolCall, step.wantToolCall) + } + } +} diff --git a/model/renderers/qwen3coder.go b/model/renderers/qwen3coder.go index 32611791b8..18853019c1 100644 --- a/model/renderers/qwen3coder.go +++ b/model/renderers/qwen3coder.go @@ -55,7 +55,9 @@ func renderAdditionalKeys(obj any, handledKeys map[string]bool) string { return sb.String() } -func Qwen3CoderRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue) (string, error) { +type Qwen3CoderRenderer struct{} + +func (r *Qwen3CoderRenderer) Render(messages []api.Message, tools []api.Tool, _ *api.ThinkValue) (string, error) { var sb strings.Builder // filter out system messages and choose the first (if any) to win diff --git a/model/renderers/qwen3coder_test.go b/model/renderers/qwen3coder_test.go index 6a9e5eccd2..1addee9e18 100644 --- a/model/renderers/qwen3coder_test.go +++ b/model/renderers/qwen3coder_test.go @@ -288,7 +288,7 @@ call tool<|im_end|> } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - rendered, err := Qwen3CoderRenderer(tt.msgs, tt.tools, nil) + rendered, err := (&Qwen3CoderRenderer{}).Render(tt.msgs, tt.tools, nil) if err != nil { t.Fatal(err) } diff --git a/model/renderers/qwen3vl.go b/model/renderers/qwen3vl.go new file mode 100644 index 0000000000..7e49fea184 --- /dev/null +++ b/model/renderers/qwen3vl.go @@ -0,0 +1,166 @@ +package renderers + +import ( + "encoding/json" + "strings" + + "github.com/ollama/ollama/api" +) + +func marshalWithSpaces(v any) ([]byte, error) { + b, err := json.Marshal(v) + if err != nil { + return nil, err + } + + out := make([]byte, 0, len(b)+len(b)/8) + inStr, esc := false, 
false + for _, c := range b { + if inStr { + out = append(out, c) + if esc { + esc = false + continue + } + if c == '\\' { + esc = true + continue + } + if c == '"' { + inStr = false + } + continue + } + switch c { + case '"': + inStr = true + out = append(out, c) + case ':': + out = append(out, ':', ' ') + case ',': + out = append(out, ',', ' ') + default: + out = append(out, c) + } + } + return out, nil +} + +type Qwen3VLRenderer struct { + isThinking bool +} + +func (r *Qwen3VLRenderer) renderContent(content api.Message, doVisionCount bool) string { + // This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go + var subSb strings.Builder + for range content.Images { + subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>") + } + // TODO: support videos + + subSb.WriteString(content.Content) + return subSb.String() +} + +func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *api.ThinkValue) (string, error) { + var sb strings.Builder + + if len(tools) > 0 { + sb.WriteString(imStartTag + "system\n") + if len(messages) > 0 && messages[0].Role == "system" { + sb.WriteString(messages[0].Content + "\n\n") + } + sb.WriteString("# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n") + for _, tool := range tools { + sb.WriteString("\n") + if b, err := marshalWithSpaces(tool); err == nil { + sb.Write(b) + } + } + sb.WriteString("\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n") + } else if len(messages) > 0 && messages[0].Role == "system" { + sb.WriteString("<|im_start|>system\n" + messages[0].Content + "<|im_end|>\n") + } + multiStepTool := true + lastQueryIndex := len(messages) - 1 // so this is the last user message + + for i := len(messages) - 1; i >= 0; i-- { + message := messages[i] + if multiStepTool && 
message.Role == "user" { + // Check if content starts with and ends with + content := r.renderContent(message, true) + if !(strings.HasPrefix(content, "") && strings.HasSuffix(content, "")) { + multiStepTool = false + lastQueryIndex = i + } + } + } + + for i, message := range messages { + content := r.renderContent(message, true) + + lastMessage := i == len(messages)-1 + prefill := lastMessage && message.Role == "assistant" + + if message.Role == "user" || message.Role == "system" && i != 0 { + sb.WriteString("<|im_start|>" + message.Role + "\n" + content + "<|im_end|>\n") + } else if message.Role == "assistant" { + contentReasoning := "" + + if r.isThinking { + if message.Thinking != "" { + contentReasoning = message.Thinking + } + } + + if r.isThinking && i > lastQueryIndex { + if i == len(messages)-1 || contentReasoning != "" { + sb.WriteString("<|im_start|>" + message.Role + "\n\n" + strings.Trim(contentReasoning, "\n")) // do we want to add a new line here? + if content != "" { + sb.WriteString("\n\n\n" + strings.TrimLeft(content, "\n")) + } + } else { + sb.WriteString("<|im_start|>" + message.Role + "\n" + content) + } + } else { + sb.WriteString("<|im_start|>" + message.Role + "\n" + content) + } + + if len(message.ToolCalls) > 0 { + for j, toolCall := range message.ToolCalls { + if j > 0 || content != "" { + sb.WriteString("\n") + } + + sb.WriteString("\n{\"name\": \"" + toolCall.Function.Name + "\", \"arguments\": ") + if b, err := marshalWithSpaces(toolCall.Function.Arguments); err == nil { + sb.Write(b) + } + sb.WriteString("}\n") + } + } + + if !prefill { + sb.WriteString("<|im_end|>\n") + } + } else if message.Role == "tool" { + if i == 0 || messages[i-1].Role != "tool" { + sb.WriteString("<|im_start|>user") + } + sb.WriteString("\n\n" + message.Content + "\n") + if i == len(messages)-1 || messages[i+1].Role != "tool" { + sb.WriteString("<|im_end|>\n") + } + } + + // prefill at the end + if lastMessage && !prefill { + 
sb.WriteString("<|im_start|>assistant\n") + if r.isThinking { + sb.WriteString("\n") + } + } + } + + return sb.String(), nil +} diff --git a/model/renderers/qwen3vl_nonthinking_test.go b/model/renderers/qwen3vl_nonthinking_test.go new file mode 100644 index 0000000000..3f50a965ee --- /dev/null +++ b/model/renderers/qwen3vl_nonthinking_test.go @@ -0,0 +1,497 @@ +package renderers + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/api" +) + +func TestQwen3VLNonThinkingRenderer(t *testing.T) { + tests := []struct { + name string + msgs []api.Message + images []api.ImageData + tools []api.Tool + expected string + }{ + { + name: "prefill", + msgs: []api.Message{ + {Role: "system", Content: "You are a helpful assistant."}, + {Role: "user", Content: "Tell me something interesting."}, + {Role: "assistant", Content: "I'll tell you something interesting about cats"}, + }, + expected: `<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +Tell me something interesting.<|im_end|> +<|im_start|>assistant +I'll tell you something interesting about cats`, + }, + { + name: "basic", + msgs: []api.Message{ + {Role: "system", Content: "You are a helpful assistant."}, + {Role: "user", Content: "Hello, how are you?"}, + }, + expected: `<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +Hello, how are you?<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "With thinking, end assistant.", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Content: "abcTo make this story interesting, I will speak in poetry."}, // does the thinking even work? 
+ }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant +abcTo make this story interesting, I will speak in poetry.`, + }, + { + name: "Multiple thinking", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Content: "abcTo make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant +abcTo make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence.`, // NOTE: the second thinking tag is not captured + }, + { + name: "Multiple thinking, multiple messages.", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Content: "abcTo make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence."}, + {Role: "user", Content: "What is the weather like in San Francisco? I will check the weather in San Francisco for you."}, + {Role: "assistant", Content: "I'll check the weather in San Francisco for you.Speak poetry after the first sentence.Speak poetry after the second sentence."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant +abcTo make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence.<|im_end|> +<|im_start|>user +What is the weather like in San Francisco? 
I will check the weather in San Francisco for you.<|im_end|> +<|im_start|>assistant +I'll check the weather in San Francisco for you.Speak poetry after the first sentence.Speak poetry after the second sentence.`, + }, + { + name: "Image", + msgs: []api.Message{ + {Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData("img2")}}, + {Role: "assistant", Content: "Let me analyze this image."}, + }, + expected: `<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|> +<|im_start|>assistant +Let me analyze this image.`, + }, + { + name: "Multiple images", + msgs: []api.Message{ + {Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")}}, + }, + expected: `<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|>Describe these images.<|im_end|> +<|im_start|>assistant +`, + }, + + // // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args + // { + // name: "with tools and response", + // msgs: []api.Message{ + // {Role: "system", Content: "You are a helpful assistant with access to tools."}, + // {Role: "user", Content: "What's the weather like in New York?"}, + // { + // Role: "assistant", + // Content: "I'll check the weather in New York for you.", + // ToolCalls: []api.ToolCall{ + // { + // Function: api.ToolCallFunction{ + // Name: "get-current-weather", + // Arguments: map[string]any{ + // "location": "New York", + // "unit": "fahrenheit", + // }, + // }, + // }, + // }, + // }, + // {Role: "tool", Content: "80", ToolName: "get-current-weather"}, + // {Role: "user", Content: "That sounds nice! 
What about San Francisco?"}, + // }, + // tools: []api.Tool{ + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "get-current-weather", + // Description: "Get the current weather for a location", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"location"}, + // Properties: map[string]api.ToolProperty{ + // "location": { + // Type: api.PropertyType{"string"}, + // Description: "The city and state, e.g. San Francisco, CA", + // }, + // "unit": { + // Type: api.PropertyType{"string"}, + // Enum: []any{"celsius", "fahrenheit"}, + // Description: "The temperature unit", + // }, + // }, + // }, + // }, + // }, + // }, + // expected: `<|im_start|>system + // You are a helpful assistant with access to tools. + + // # Tools + + // You may call one or more functions to assist with the user query. + + // You are provided with function signatures within XML tags: + // + // {"type": "function", "function": {"name": "get-current-weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "The temperature unit"}}, "required": ["location"]}}} + // + + // For each function call, return a json object with function name and arguments within XML tags: + // + // {"name": , "arguments": } + // <|im_end|> + // <|im_start|>user + // What's the weather like in New York?<|im_end|> + // <|im_start|>assistant + // I'll check the weather in New York for you. + // + // {"name": "get-current-weather", "arguments": {"location": "New York", "unit": "fahrenheit"}} + // <|im_end|> + // <|im_start|>user + // + // 80 + // <|im_end|> + // <|im_start|>user + // That sounds nice! 
What about San Francisco?<|im_end|> + // <|im_start|>assistant + // `, + // }, + // // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args + // { + // name: "With tools and response, multiple tool calls", + // msgs: []api.Message{ + // { + // Role: "system", + // Content: "You are a helpful assistant with access to tools.", + // }, + // { + // Role: "user", + // Content: "Call two tools for me: add and multiply.", + // }, + // { + // Role: "assistant", + // Content: "Sure, I'll call both tools for you.", + // ToolCalls: []api.ToolCall{ + // { + // Function: api.ToolCallFunction{ + // Name: "add", + // Arguments: map[string]any{ + // "a": 2, + // "b": 3, + // }, + // }, + // }, + // { + // Function: api.ToolCallFunction{ + // Name: "multiply", + // Arguments: map[string]any{ + // "x": 4, + // "y": 5, + // }, + // }, + // }, + // }, + // }, + // { + // Role: "tool", + // Content: "5", + // ToolName: "add", + // }, + // { + // Role: "tool", + // Content: "20", + // ToolName: "multiply", + // }, + // { + // Role: "user", + // Content: "Thanks! 
What are the results?", + // }, + // }, + // tools: []api.Tool{ + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "add", + // Description: "Add two numbers", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"a", "b"}, + // Properties: map[string]api.ToolProperty{ + // "a": {Type: api.PropertyType{"integer"}, Description: "First number"}, + // "b": {Type: api.PropertyType{"integer"}, Description: "Second number"}, + // }, + // }, + // }, + // }, + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "multiply", + // Description: "Multiply two numbers", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"x", "y"}, + // Properties: map[string]api.ToolProperty{ + // "x": {Type: api.PropertyType{"integer"}, Description: "First factor"}, + // "y": {Type: api.PropertyType{"integer"}, Description: "Second factor"}, + // }, + // }, + // }, + // }, + // }, + // expected: `<|im_start|>system + // You are a helpful assistant with access to tools. + + // # Tools + + // You may call one or more functions to assist with the user query. 
+ + // You are provided with function signatures within XML tags: + // + // {"type": "function", "function": {"name": "add", "description": "Add two numbers", "parameters": {"type": "object", "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}}, "required": ["a", "b"]}}} + // {"type": "function", "function": {"name": "multiply", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"description": "First factor"}, "y": {"description": "Second factor"}}, "required": ["x", "y"]}}} + // + + // For each function call, return a json object with function name and arguments within XML tags: + // + // {"name": , "arguments": } + // <|im_end|> + // <|im_start|>user + // Call two tools for me: add and multiply.<|im_end|> + // <|im_start|>assistant + // Sure, I'll call both tools for you. + // + // {"name": "add", "arguments": {"a": 2, "b": 3}} + // + // + // {"name": "multiply", "arguments": {"x": 4, "y": 5}} + // <|im_end|> + // <|im_start|>user + // + // 5 + // + // + // 20 + // <|im_end|> + // <|im_start|>user + // Thanks! What are the results?<|im_end|> + // <|im_start|>assistant + // `, + // }, + { + name: "user tool_response block preserved", + msgs: []api.Message{ + {Role: "user", Content: "What's the weather?"}, + { + Role: "assistant", + Content: "I'll check.", + ToolCalls: []api.ToolCall{ + {Function: api.ToolCallFunction{Name: "get-current-weather", Arguments: map[string]any{"location": "Paris", "unit": "celsius"}}}, + }, + }, + {Role: "user", Content: "\n18\n"}, + {Role: "user", Content: "Thanks!"}, + }, + expected: `<|im_start|>user +What's the weather?<|im_end|> +<|im_start|>assistant +I'll check. 
+ +{"name": "get-current-weather", "arguments": {"location": "Paris", "unit": "celsius"}} +<|im_end|> +<|im_start|>user + +18 +<|im_end|> +<|im_start|>user +Thanks!<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "assistant with multiple tool calls and content", + msgs: []api.Message{ + {Role: "user", Content: "Hi"}, + { + Role: "assistant", + Content: "before", + ToolCalls: []api.ToolCall{ + {Function: api.ToolCallFunction{Name: "add", Arguments: map[string]any{"a": 2, "b": 3}}}, + {Function: api.ToolCallFunction{Name: "mul", Arguments: map[string]any{"x": 4, "y": 5}}}, + }, + }, + }, + expected: `<|im_start|>user +Hi<|im_end|> +<|im_start|>assistant +before + +{"name": "add", "arguments": {"a": 2, "b": 3}} + + +{"name": "mul", "arguments": {"x": 4, "y": 5}} +`, + }, + { + name: "consecutive tool responses grouped", + msgs: []api.Message{ + {Role: "user", Content: "Compute results"}, + {Role: "assistant", Content: "ok", ToolCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "job", Arguments: map[string]any{"n": 1}}}}}, + {Role: "tool", Content: "5", ToolName: "job"}, + {Role: "tool", Content: "6", ToolName: "job"}, + }, + expected: `<|im_start|>user +Compute results<|im_end|> +<|im_start|>assistant +ok + +{"name": "job", "arguments": {"n": 1}} +<|im_end|> +<|im_start|>user + +5 + + +6 +<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "last message is tool then prefill", + msgs: []api.Message{ + {Role: "user", Content: "run"}, + {Role: "assistant", Content: "ok", ToolCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "exec", Arguments: map[string]any{"cmd": "ls"}}}}}, + {Role: "tool", Content: "done", ToolName: "exec"}, + }, + expected: `<|im_start|>user +run<|im_end|> +<|im_start|>assistant +ok + +{"name": "exec", "arguments": {"cmd": "ls"}} +<|im_end|> +<|im_start|>user + +done +<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "user with multiple images", + msgs: []api.Message{ + {Role: "user", Content: "Describe.", Images: 
[]api.ImageData{api.ImageData("img1"), api.ImageData("img2")}}, + }, + expected: `<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|>Describe.<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "user tool_response, no whitespace", + msgs: []api.Message{ + {Role: "user", Content: "What's the weather?"}, + { + Role: "assistant", + Content: "I'll check.", + ToolCalls: []api.ToolCall{ + {Function: api.ToolCallFunction{Name: "get-current-weather", Arguments: map[string]any{"location": "Paris", "unit": "celsius"}}}, + }, + }, + {Role: "user", Content: "\n18\n"}, + {Role: "user", Content: "Thanks!"}, + }, + expected: `<|im_start|>user +What's the weather?<|im_end|> +<|im_start|>assistant +I'll check. + +{"name": "get-current-weather", "arguments": {"location": "Paris", "unit": "celsius"}} +<|im_end|> +<|im_start|>user + +18 +<|im_end|> +<|im_start|>user +Thanks!<|im_end|> +<|im_start|>assistant +`, + }, + { + name: "user tool_response with surrounding whitespace", + msgs: []api.Message{ + {Role: "user", Content: "What's the weather?"}, + { + Role: "assistant", + Content: "I'll check.", + ToolCalls: []api.ToolCall{ + {Function: api.ToolCallFunction{Name: "get-current-weather", Arguments: map[string]any{"location": "Paris", "unit": "celsius"}}}, + }, + }, + {Role: "user", Content: "\n\n\n\n\n18\n extra\n\n\n\n\n\n"}, + }, + expected: `<|im_start|>user +What's the weather?<|im_end|> +<|im_start|>assistant +I'll check. 
+ +{"name": "get-current-weather", "arguments": {"location": "Paris", "unit": "celsius"}} +<|im_end|> +<|im_start|>user + + + + + +18 + extra + + + + + +<|im_end|> +<|im_start|>assistant +`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rendered, err := (&Qwen3VLRenderer{false}).Render(tt.msgs, tt.tools, nil) + if err != nil { + t.Fatal(err) + } + if diff := cmp.Diff(rendered, tt.expected); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + } +} diff --git a/model/renderers/qwen3vl_test.go b/model/renderers/qwen3vl_test.go new file mode 100644 index 0000000000..6810a7c948 --- /dev/null +++ b/model/renderers/qwen3vl_test.go @@ -0,0 +1,346 @@ +package renderers + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +// TODO(drifkin): this will be moved to utils in the near future and used by other renderers as well +func TestMarshalWithSpaces(t *testing.T) { + tests := []struct { + name string + input any + expected string + }{ + // basic formatting tests + { + name: "simple object", + input: map[string]any{"key": "value"}, + expected: `{"key": "value"}`, + }, + { + name: "simple array", + input: []any{"a", "b", "c"}, + expected: `["a", "b", "c"]`, + }, + // escaped quotes + { + name: "escaped quote in string", + input: map[string]any{"text": `quote"inside`}, + expected: `{"text": "quote\"inside"}`, + }, + { + name: "multiple escaped quotes", + input: map[string]any{"text": `say "hello" and "goodbye"`}, + expected: `{"text": "say \"hello\" and \"goodbye\""}`, + }, + // escaped backslashes + { + name: "escaped backslash", + input: map[string]any{"path": `C:\windows\system32`}, + expected: `{"path": "C:\\windows\\system32"}`, + }, + { + name: "double backslash", + input: map[string]any{"text": `test\\more`}, + expected: `{"text": "test\\\\more"}`, + }, + { + name: "backslash before quote", + input: map[string]any{"text": `end with \"`}, + expected: `{"text": "end with \\\""}`, + }, + // standard JSON escape 
sequences + { + name: "newline in string", + input: map[string]any{"text": "line1\nline2"}, + expected: `{"text": "line1\nline2"}`, + }, + { + name: "tab in string", + input: map[string]any{"text": "before\tafter"}, + expected: `{"text": "before\tafter"}`, + }, + { + name: "carriage return", + input: map[string]any{"text": "before\rafter"}, + expected: `{"text": "before\rafter"}`, + }, + { + name: "multiple escape sequences", + input: map[string]any{"text": "line1\nline2\ttab\rcarriage"}, + expected: `{"text": "line1\nline2\ttab\rcarriage"}`, + }, + // strings containing colons and commas (no spaces should be added inside) + { + name: "colon in string", + input: map[string]any{"url": "http://example.com"}, + expected: `{"url": "http://example.com"}`, + }, + { + name: "comma in string", + input: map[string]any{"list": "apple, banana, cherry"}, + expected: `{"list": "apple, banana, cherry"}`, + }, + { + name: "colon and comma in string", + input: map[string]any{"data": "key:value, key2:value2"}, + expected: `{"data": "key:value, key2:value2"}`, + }, + // unicode characters + { + name: "emoji", + input: map[string]any{"emoji": "๐Ÿ˜€๐ŸŽ‰โœจ"}, + expected: `{"emoji": "๐Ÿ˜€๐ŸŽ‰โœจ"}`, + }, + { + name: "chinese characters", + input: map[string]any{"text": "ไฝ ๅฅฝไธ–็•Œ"}, + expected: `{"text": "ไฝ ๅฅฝไธ–็•Œ"}`, + }, + { + name: "arabic characters", + input: map[string]any{"text": "ู…ุฑุญุจุง"}, + expected: `{"text": "ู…ุฑุญุจุง"}`, + }, + { + name: "mixed unicode and ascii", + input: map[string]any{"text": "Hello ไธ–็•Œ! ๐Ÿ˜€"}, + expected: `{"text": "Hello ไธ–็•Œ! 
๐Ÿ˜€"}`, + }, + { + name: "unicode with special symbols", + input: map[string]any{"text": "ยฎยฉโ„ขโ‚ฌยฃยฅ"}, + expected: `{"text": "ยฎยฉโ„ขโ‚ฌยฃยฅ"}`, + }, + // complex combinations - strings that look like JSON + { + name: "json string inside value", + input: map[string]any{"nested": `{"key":"value"}`}, + expected: `{"nested": "{\"key\":\"value\"}"}`, + }, + { + name: "json array inside value", + input: map[string]any{"array": `["a","b","c"]`}, + expected: `{"array": "[\"a\",\"b\",\"c\"]"}`, + }, + // edge cases + { + name: "empty string", + input: map[string]any{"empty": ""}, + expected: `{"empty": ""}`, + }, + { + name: "empty object", + input: map[string]any{}, + expected: `{}`, + }, + { + name: "empty array", + input: []any{}, + expected: `[]`, + }, + { + name: "numbers", + input: map[string]any{"int": 42, "float": 3.14}, + expected: `{"float": 3.14, "int": 42}`, + }, + { + name: "boolean", + input: map[string]any{"bool": true, "other": false}, + expected: `{"bool": true, "other": false}`, + }, + { + name: "null value", + input: map[string]any{"value": nil}, + expected: `{"value": null}`, + }, + // nested structures with complex strings + { + name: "nested object with escapes", + input: map[string]any{ + "outer": map[string]any{ + "path": `C:\folder\file.txt`, + "quote": `He said "hi"`, + }, + }, + expected: `{"outer": {"path": "C:\\folder\\file.txt", "quote": "He said \"hi\""}}`, + }, + { + name: "array with unicode and escapes", + input: []any{ + "normal", + "with\nnewline", + "with\"quote", + "emoji๐Ÿ˜€", + "colon:comma,", + }, + expected: `["normal", "with\nnewline", "with\"quote", "emoji๐Ÿ˜€", "colon:comma,"]`, + }, + { + name: "backslash at positions before special chars", + input: map[string]any{"text": `a\b:c\d,e`}, + expected: `{"text": "a\\b:c\\d,e"}`, + }, + { + name: "multiple backslashes before quote", + input: map[string]any{"text": `ends\\"`}, + expected: `{"text": "ends\\\\\""}`, + }, + { + name: "unicode with escapes", + input: 
map[string]any{"text": "Hello\nไธ–็•Œ\t๐Ÿ˜€"}, + expected: `{"text": "Hello\nไธ–็•Œ\t๐Ÿ˜€"}`, + }, + + // Real-world tool call example + { + name: "tool call arguments", + input: map[string]any{ + "location": "San Francisco, CA", + "unit": "fahrenheit", + "format": "json", + }, + expected: `{"format": "json", "location": "San Francisco, CA", "unit": "fahrenheit"}`, + }, + { + name: "complex tool arguments with escapes", + input: map[string]any{ + "query": `SELECT * FROM "users" WHERE name = 'O'Brien'`, + "description": "Fetch user\ndata from DB", + "path": `C:\data\users.db`, + }, + expected: `{"description": "Fetch user\ndata from DB", "path": "C:\\data\\users.db", "query": "SELECT * FROM \"users\" WHERE name = 'O'Brien'"}`, + }, + { + name: "unicode immediately adjacent to JSON structure chars", + input: map[string]any{"๐Ÿ˜€key": "๐Ÿ˜€value", "test": "๐Ÿ˜€:๐Ÿ˜€,๐Ÿ˜€"}, + expected: `{"test": "๐Ÿ˜€:๐Ÿ˜€,๐Ÿ˜€", "๐Ÿ˜€key": "๐Ÿ˜€value"}`, + }, + { + name: "long unicode string stress test", + input: map[string]any{"text": "๐Ÿ˜€๐Ÿ˜๐Ÿ˜‚๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜…๐Ÿ˜†๐Ÿ˜‡๐Ÿ˜ˆ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‹๐Ÿ˜Œ๐Ÿ˜๐Ÿ˜Ž๐Ÿ˜๐Ÿ˜๐Ÿ˜‘๐Ÿ˜’๐Ÿ˜“๐Ÿ˜”๐Ÿ˜•๐Ÿ˜–๐Ÿ˜—๐Ÿ˜˜๐Ÿ˜™๐Ÿ˜š๐Ÿ˜›๐Ÿ˜œ๐Ÿ˜๐Ÿ˜ž๐Ÿ˜Ÿ"}, + expected: `{"text": "๐Ÿ˜€๐Ÿ˜๐Ÿ˜‚๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜…๐Ÿ˜†๐Ÿ˜‡๐Ÿ˜ˆ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‹๐Ÿ˜Œ๐Ÿ˜๐Ÿ˜Ž๐Ÿ˜๐Ÿ˜๐Ÿ˜‘๐Ÿ˜’๐Ÿ˜“๐Ÿ˜”๐Ÿ˜•๐Ÿ˜–๐Ÿ˜—๐Ÿ˜˜๐Ÿ˜™๐Ÿ˜š๐Ÿ˜›๐Ÿ˜œ๐Ÿ˜๐Ÿ˜ž๐Ÿ˜Ÿ"}`, + }, + { + name: "deeply nested with unicode everywhere", + input: map[string]any{ + "๐Ÿ˜€": map[string]any{ + "ไฝ ๅฅฝ": []any{"ู…ุฑุญุจุง", "ยฎยฉโ„ข", "โˆ‘โˆซโˆ‚โˆš"}, + }, + }, + expected: `{"๐Ÿ˜€": {"ไฝ ๅฅฝ": ["ู…ุฑุญุจุง", "ยฎยฉโ„ข", "โˆ‘โˆซโˆ‚โˆš"]}}`, + }, + { + name: "unicode with all JSON special chars interleaved", + input: map[string]any{"k๐Ÿ˜€:k": "v๐Ÿ˜€,v", "a:๐Ÿ˜€": "b,๐Ÿ˜€", "๐Ÿ˜€": ":,๐Ÿ˜€,:"}, + expected: `{"a:๐Ÿ˜€": "b,๐Ÿ˜€", "k๐Ÿ˜€:k": "v๐Ÿ˜€,v", "๐Ÿ˜€": ":,๐Ÿ˜€,:"}`, + }, + { + name: "combining diacritics and RTL text", + input: map[string]any{"hebrew": "ืขึดื‘ึฐืจึดื™ืช", "combined": "รฉฬ€รฑ", "mixed": "test:ืขึดื‘ึฐืจึดื™ืช,รฉฬ€รฑ"}, + expected: 
`{"combined": "รฉฬ€รฑ", "hebrew": "ืขึดื‘ึฐืจึดื™ืช", "mixed": "test:ืขึดื‘ึฐืจึดื™ืช,รฉฬ€รฑ"}`, + }, + { + name: "pathological case: unicode + escapes + special chars", + input: map[string]any{"๐Ÿ˜€": "test\n๐Ÿ˜€\"quote๐Ÿ˜€\\backslash๐Ÿ˜€:colon๐Ÿ˜€,comma๐Ÿ˜€"}, + expected: `{"๐Ÿ˜€": "test\n๐Ÿ˜€\"quote๐Ÿ˜€\\backslash๐Ÿ˜€:colon๐Ÿ˜€,comma๐Ÿ˜€"}`, + }, + + // all JSON structural characters inside strings + { + name: "braces and brackets in strings", + input: map[string]any{"text": "test{with}braces[and]brackets"}, + expected: `{"text": "test{with}braces[and]brackets"}`, + }, + { + name: "braces and brackets with colons and commas", + input: map[string]any{"code": "{key:value,[1,2,3]}"}, + expected: `{"code": "{key:value,[1,2,3]}"}`, + }, + { + name: "json-like string with all structural chars", + input: map[string]any{"schema": `{"type":"object","properties":{"name":{"type":"string"},"items":{"type":"array"}}}`}, + expected: `{"schema": "{\"type\":\"object\",\"properties\":{\"name\":{\"type\":\"string\"},\"items\":{\"type\":\"array\"}}}"}`, + }, + + // forward slash tests (JSON allows \/ as an escape sequence) + { + name: "forward slash in URL", + input: map[string]any{"url": "https://example.com/path/to/resource"}, + expected: `{"url": "https://example.com/path/to/resource"}`, + }, + { + name: "regex pattern with slashes", + input: map[string]any{"regex": "/[a-z]+/gi"}, + expected: `{"regex": "/[a-z]+/gi"}`, + }, + + // all JSON escape sequences + { + name: "backspace escape", + input: map[string]any{"text": "before\bafter"}, + expected: `{"text": "before\bafter"}`, + }, + { + name: "form feed escape", + input: map[string]any{"text": "before\fafter"}, + expected: `{"text": "before\fafter"}`, + }, + { + name: "all standard escapes combined", + input: map[string]any{"text": "\"\\\b\f\n\r\t"}, + expected: `{"text": "\"\\\b\f\n\r\t"}`, + }, + + // unicode escape sequences + { + name: "string that forces unicode escapes", + input: map[string]any{"control": 
"\u0000\u0001\u001f"}, + expected: `{"control": "\u0000\u0001\u001f"}`, + }, + + // empty objects and arrays nested with strings + { + name: "nested empty structures with string values", + input: map[string]any{"empty_obj": map[string]any{}, "empty_arr": []any{}, "text": "{}[]"}, + expected: `{"empty_arr": [], "empty_obj": {}, "text": "{}[]"}`, + }, + + // complex nesting with all structural characters + { + name: "deeply nested with all char types", + input: map[string]any{ + "level1": map[string]any{ + "array": []any{ + map[string]any{"nested": "value:with,special{chars}[here]"}, + []any{"a", "b", "c"}, + }, + }, + }, + expected: `{"level1": {"array": [{"nested": "value:with,special{chars}[here]"}, ["a", "b", "c"]]}}`, + }, + + // string containing escaped structural characters + { + name: "string with multiple escape sequences and structural chars", + input: map[string]any{"data": "test\"quote\"{brace}[bracket]:colon,comma\\backslash/slash"}, + expected: `{"data": "test\"quote\"{brace}[bracket]:colon,comma\\backslash/slash"}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := marshalWithSpaces(tt.input) + if err != nil { + t.Fatalf("marshalWithSpaces failed: %v", err) + } + + resultStr := string(result) + if diff := cmp.Diff(resultStr, tt.expected); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + } +} diff --git a/model/renderers/qwen3vl_thinking_test.go b/model/renderers/qwen3vl_thinking_test.go new file mode 100644 index 0000000000..2bb2c2d0d9 --- /dev/null +++ b/model/renderers/qwen3vl_thinking_test.go @@ -0,0 +1,372 @@ +package renderers + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/api" +) + +func TestQwen3VLThinkingRenderer(t *testing.T) { + tests := []struct { + name string + msgs []api.Message + images []api.ImageData + tools []api.Tool + expected string + }{ + { + name: "basic", + msgs: []api.Message{ + {Role: "system", Content: "You are a 
helpful assistant."}, + {Role: "user", Content: "Hello, how are you?"}, + }, + expected: `<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +Hello, how are you?<|im_end|> +<|im_start|>assistant + +`, + }, + { + name: "With thinking, end assistant.", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Content: "abc", Thinking: "To make this story interesting, I will speak in poetry."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant + +To make this story interesting, I will speak in poetry. + + +abc`, + }, + { + name: "With thinking, end assistant.", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Thinking: "To make this story interesting, I will speak in poetry."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant + +To make this story interesting, I will speak in poetry.`, + }, + { + name: "Multiple thinking", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Content: "abc", Thinking: "To make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant + +To make this story interesting, I will speak in poetry.And I will speak in poetry after the first sentence. 
+ + +abc`, // NOTE: the second thinking tag is not captured + }, + { + name: "Multiple thinking, multiple messages.", + msgs: []api.Message{ + {Role: "user", Content: "Tell me a story in two sentences."}, + {Role: "assistant", Thinking: "To make this story interesting, I will speak in poetry.", Content: "abc"}, + {Role: "user", Content: "What is the weather like in San Francisco?"}, + {Role: "assistant", Thinking: "Speak poetry after the first sentence.Speak poetry after the second sentence."}, + }, + expected: `<|im_start|>user +Tell me a story in two sentences.<|im_end|> +<|im_start|>assistant +abc<|im_end|> +<|im_start|>user +What is the weather like in San Francisco?<|im_end|> +<|im_start|>assistant + +Speak poetry after the first sentence.Speak poetry after the second sentence.`, + }, + // NOTE: Servers automatically prepend a [img-] tag + // { + // name: "Image", + // msgs: []api.Message{ + // {Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData(IMAGE2_BASE64)}}, + // }, + // expected: `<|im_start|>user + // [img-0]Describe this image.<|im_end|> + // <|im_start|>assistant + // + // `, + // }, + + // NOTE: Servers automatically prepend a [img-] tag + // { + // name: "Multiple images", + // msgs: []api.Message{ + // {Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData(IMAGE1_BASE64), api.ImageData(IMAGE2_BASE64)}}, + // }, + // expected: `<|im_start|>user + // [img-0][img-1]Describe these images.<|im_end|> + // <|im_start|>assistant + // + // `, + // }, + + // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args + // { + // name: "with tools and response", + // msgs: []api.Message{ + // {Role: "system", Content: "You are a helpful assistant with access to tools."}, + // {Role: "user", Content: "What's the weather like in New York?"}, + // { + // Role: "assistant", + // Content: "I'll check the weather in New York for you.", + // ToolCalls: 
[]api.ToolCall{ + // { + // Function: api.ToolCallFunction{ + // Name: "get-current-weather", + // Arguments: map[string]any{ + // "location": "New York", + // "unit": "fahrenheit", + // }, + // }, + // }, + // }, + // }, + // {Role: "tool", Content: "80", ToolName: "get-current-weather"}, + // {Role: "user", Content: "That sounds nice! What about San Francisco?"}, + // }, + // tools: []api.Tool{ + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "get-current-weather", + // Description: "Get the current weather for a location", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"location"}, + // Properties: map[string]api.ToolProperty{ + // "location": { + // Type: api.PropertyType{"string"}, + // Description: "The city and state, e.g. San Francisco, CA", + // }, + // "unit": { + // Type: api.PropertyType{"string"}, + // Enum: []any{"celsius", "fahrenheit"}, + // Description: "The temperature unit", + // }, + // }, + // }, + // }, + // }, + // }, + // expected: `<|im_start|>system + // You are a helpful assistant with access to tools. + + // # Tools + + // You may call one or more functions to assist with the user query. + + // You are provided with function signatures within XML tags: + // + // {"type": "function", "function": {"name": "get-current-weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "The temperature unit"}}, "required": ["location"]}}} + // + + // For each function call, return a json object with function name and arguments within XML tags: + // + // {"name": , "arguments": } + // <|im_end|> + // <|im_start|>user + // What's the weather like in New York?<|im_end|> + // <|im_start|>assistant + // I'll check the weather in New York for you. 
+ // + // {"name": "get-current-weather", "arguments": {"location": "New York", "unit": "fahrenheit"}} + // <|im_end|> + // <|im_start|>user + // + // 80 + // <|im_end|> + // <|im_start|>user + // That sounds nice! What about San Francisco?<|im_end|> + // <|im_start|>assistant + // + // `, + // }, + + // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args + // { + // name: "With tools and response, multiple tool calls", + // msgs: []api.Message{ + // { + // Role: "system", + // Content: "You are a helpful assistant with access to tools.", + // }, + // { + // Role: "user", + // Content: "Call two tools for me: add and multiply.", + // }, + // { + // Role: "assistant", + // Content: "Sure, I'll call both tools for you.", + // ToolCalls: []api.ToolCall{ + // { + // Function: api.ToolCallFunction{ + // Name: "add", + // Arguments: map[string]any{ + // "a": 2, + // "b": 3, + // }, + // }, + // }, + // { + // Function: api.ToolCallFunction{ + // Name: "multiply", + // Arguments: map[string]any{ + // "x": 4, + // "y": 5, + // }, + // }, + // }, + // }, + // }, + // { + // Role: "tool", + // Content: "5", + // ToolName: "add", + // }, + // { + // Role: "tool", + // Content: "20", + // ToolName: "multiply", + // }, + // { + // Role: "user", + // Content: "Thanks! 
What are the results?", + // }, + // }, + // tools: []api.Tool{ + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "add", + // Description: "Add two numbers", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"a", "b"}, + // Properties: map[string]api.ToolProperty{ + // "a": {Type: api.PropertyType{"integer"}, Description: "First number"}, + // "b": {Type: api.PropertyType{"integer"}, Description: "Second number"}, + // }, + // }, + // }, + // }, + // { + // Type: "function", + // Function: api.ToolFunction{ + // Name: "multiply", + // Description: "Multiply two numbers", + // Parameters: api.ToolFunctionParameters{ + // Type: "object", + // Required: []string{"x", "y"}, + // Properties: map[string]api.ToolProperty{ + // "x": {Type: api.PropertyType{"integer"}, Description: "First factor"}, + // "y": {Type: api.PropertyType{"integer"}, Description: "Second factor"}, + // }, + // }, + // }, + // }, + // }, + // expected: `<|im_start|>system + // You are a helpful assistant with access to tools. + + // # Tools + + // You may call one or more functions to assist with the user query. + + // You are provided with function signatures within XML tags: + // + // {"type": "function", "function": {"name": "add", "description": "Add two numbers", "parameters": {"type": "object", "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}}, "required": ["a", "b"]}}} + // {"type": "function", "function": {"name": "multiply", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"type": "integer"}, "y": {"type": "integer"}}, "required": ["x", "y"]}}} + // + + // For each function call, return a json object with function name and arguments within XML tags: + // + // {"name": , "arguments": } + // <|im_end|> + // <|im_start|>user + // Call two tools for me: add and multiply.<|im_end|> + // <|im_start|>assistant + // Sure, I'll call both tools for you. 
+		//
+		// {"name": "add", "arguments": {"a": 2, "b": 3}}
+		//
+		//
+		// {"name": "multiply", "arguments": {"x": 4, "y": 5}}
+		// <|im_end|>
+		// <|im_start|>user
+		//
+		// 5
+		//
+		//
+		// 20
+		// <|im_end|>
+		// <|im_start|>user
+		// Thanks! What are the results?<|im_end|>
+		// <|im_start|>assistant
+		//
+		// `,
+		// },
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
+				t.Errorf("mismatch (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestFormatToolCallArgumentThinkingVL(t *testing.T) {
+	tests := []struct {
+		name     string
+		arg      any
+		expected string
+	}{
+		{
+			name:     "string",
+			arg:      "foo",
+			expected: "foo",
+		},
+		{
+			name:     "map",
+			arg:      map[string]any{"foo": "bar"},
+			expected: "{\"foo\":\"bar\"}",
+		},
+		{
+			name:     "number",
+			arg:      1,
+			expected: "1",
+		},
+		{
+			name:     "boolean",
+			arg:      true,
+			expected: "true",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := formatToolCallArgument(tt.arg)
+			if got != tt.expected {
+				t.Errorf("formatToolCallArgument(%v) = %v, want %v", tt.arg, got, tt.expected)
+			}
+		})
+	}
+}
diff --git a/model/renderers/renderer.go b/model/renderers/renderer.go
index 2dfb51e490..01e0c6ee2a 100644
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -1,25 +1,23 @@
 package renderers
 
-import (
-	"fmt"
+import "github.com/ollama/ollama/api"
 
-	"github.com/ollama/ollama/api"
-)
-
-type rendererFunc func([]api.Message, []api.Tool, *api.ThinkValue) (string, error)
-
-func RenderWithRenderer(name string, msgs []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error) {
-	renderer := rendererForName(name)
-	if renderer == nil {
-		return "", fmt.Errorf("unknown renderer %q", name)
-	}
-	return renderer(msgs, tools, think)
+// Renderer renders chat messages, tool definitions and the think setting into a prompt string.
+type Renderer interface {
+	// Render returns the rendered prompt, or an error if rendering fails.
+	Render(messages []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error)
 }
 
-func rendererForName(name string) rendererFunc {
+// RendererForName returns the built-in renderer for name, or nil when name is
+// not recognized. Callers must check for nil before calling Render.
+func RendererForName(name string) Renderer {
 	switch name {
 	case "qwen3-coder":
-		return Qwen3CoderRenderer
+		renderer := &Qwen3CoderRenderer{}
+		return renderer
+	case "qwen3-vl-instruct":
+		renderer := &Qwen3VLRenderer{false}
+		return renderer
 	default:
 		return nil
 	}
diff --git a/openai/openai.go b/openai/openai.go
index a01c5d35c2..0e2f7313a8 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -456,6 +456,11 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 
 					types := []string{"jpeg", "jpg", "png", "webp"}
 					valid := false
+					// support blank mime type to match api/chat taking just unadorned base64
+					if strings.HasPrefix(url, "data:;base64,") {
+						url = strings.TrimPrefix(url, "data:;base64,")
+						valid = true
+					}
 					for _, t := range types {
 						prefix := "data:image/" + t + ";base64,"
 						if strings.HasPrefix(url, prefix) {
diff --git a/server/prompt.go b/server/prompt.go
index 2175919821..f1f7cec4de 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -106,7 +106,13 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 func renderPrompt(m *Model, msgs []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error) {
 	if m.Config.Renderer != "" {
-		rendered, err := renderers.RenderWithRenderer(m.Config.Renderer, msgs, tools, think)
+		renderer := renderers.RendererForName(m.Config.Renderer)
+		// RendererForName returns nil for unrecognized names; report that as an
+		// error (as RenderWithRenderer did) instead of panicking on a nil interface.
+		if renderer == nil {
+			return "", fmt.Errorf("unknown renderer %q", m.Config.Renderer)
+		}
+		rendered, err := renderer.Render(msgs, tools, think)
 		if err != nil {
 			return "", err
 		}