mirror of
https://github.com/ollama/ollama.git
synced 2025-11-10 21:57:30 +01:00
In <https://github.com/ollama/ollama/issues/12357> we see that the model will output tool calls such as ``` <function=shell> <parameter=command> pwd && ls -la </parameter> </function> ``` We parse this using the approach of transforming into valid xml and then using an xml parser. While we do transform the function and parameter names, we weren't escaping the parameter values (which in this example are invalid since `pwd && ls -la` contains unescaped ampersands). This has been fixed by first transforming the tags in the same way, and then walking the transformed string and escaping the text in between the tags. This also fixes a case where `<` in the middle of a parameter value would cause an xml parse failure. Fixes: #12357
448 lines
13 KiB
Go
448 lines
13 KiB
Go
package parsers
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"log/slog"
|
|
"math"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/ollama/ollama/api"
|
|
"github.com/ollama/ollama/logutil"
|
|
)
|
|
|
|
// qwenParserState identifies which phase of the streamed output the parser is
// currently in.
type qwenParserState int

// Parser phases. The first one is the zero value, so a zero-valued parser
// starts out looking for the beginning of a tool call.
const (
	// scanning content for toolOpenTag
	qwenParserState_LookingForToolStart qwenParserState = iota
	// inside a tool call, buffering until toolCloseTag is seen
	qwenParserState_CollectingToolContent
)

// Literal tags that delimit a tool call in the model output.
const (
	toolOpenTag  = "<tool_call>"
	toolCloseTag = "</tool_call>"
)
|
|
|
|
type Qwen3CoderParser struct {
|
|
state qwenParserState
|
|
acc strings.Builder
|
|
}
|
|
|
|
func (p *Qwen3CoderParser) HasToolSupport() bool {
|
|
return true
|
|
}
|
|
|
|
func (p *Qwen3CoderParser) HasThinkingSupport() bool {
|
|
return false
|
|
}
|
|
|
|
func (p *Qwen3CoderParser) Add(s string, tools []api.Tool) (content string, thinking string, calls []api.ToolCall, err error) {
|
|
p.acc.WriteString(s)
|
|
|
|
events := p.parseEvents()
|
|
|
|
var toolCalls []api.ToolCall
|
|
var sb strings.Builder
|
|
for _, event := range events {
|
|
switch event := event.(type) {
|
|
case qwenEventRawToolCall:
|
|
toolCall, err := parseToolCall(event, tools)
|
|
if err != nil {
|
|
slog.Warn("qwen tool call parsing failed", "error", err)
|
|
return "", "", nil, err
|
|
}
|
|
toolCalls = append(toolCalls, toolCall)
|
|
case qwenEventContent:
|
|
// TODO(drifkin): if the same turn contains multiple interleaved content
|
|
// events, we naively append them together here. See the note below about
|
|
// `qwenEvent`s for more details
|
|
sb.WriteString(event.content)
|
|
}
|
|
}
|
|
|
|
return sb.String(), "", toolCalls, nil
|
|
}
|
|
|
|
func (p *Qwen3CoderParser) parseEvents() []qwenEvent {
|
|
var all []qwenEvent
|
|
|
|
keepLooping := true
|
|
for keepLooping {
|
|
var events []qwenEvent
|
|
events, keepLooping = eat(p)
|
|
if len(events) > 0 {
|
|
all = append(all, events...)
|
|
}
|
|
}
|
|
|
|
if len(all) > 0 {
|
|
slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "acc", p.acc.String())
|
|
}
|
|
|
|
return all
|
|
}
|
|
|
|
// qwenEvent is the internal event type used to communicate between `eat` and
// `Add`. Modelling the output as a sequence of events lets the parser support
// interleaved content and parallel tool calls, even though qwen3-coder isn't
// supposed to produce them. Our API doesn't currently support models
// outputting multiple messages in a turn, so we couldn't represent that yet,
// but there's no reason to prevent the parser from supporting it, especially
// for future models that end up using a similar format.
type qwenEvent interface {
	isQwenEvent()
}

type (
	// qwenEventContent carries a run of plain content.
	qwenEventContent struct {
		content string
	}

	// qwenEventRawToolCall carries the unparsed text found between the tool
	// call open and close tags.
	qwenEventRawToolCall struct {
		raw string
	}
)

// isQwenEvent marks qwenEventRawToolCall as a qwenEvent.
func (qwenEventRawToolCall) isQwenEvent() {}

// isQwenEvent marks qwenEventContent as a qwenEvent.
func (qwenEventContent) isQwenEvent() {}
|
|
|
|
// eat consumes the parser's buffer, and returns a list of any unambiguous
|
|
// events from the current parser state. If the parser transitions to another
|
|
// state, it may have additional events to emit on the next call, which is what
|
|
// the second return value indicates
|
|
func eat(p *Qwen3CoderParser) ([]qwenEvent, bool) {
|
|
var events []qwenEvent
|
|
|
|
switch p.state {
|
|
case qwenParserState_LookingForToolStart:
|
|
if strings.Contains(p.acc.String(), toolOpenTag) {
|
|
// we found a full tool open tag, so we can emit the content before the
|
|
// tag, being sure to trim any trailing whitespace
|
|
split := strings.SplitN(p.acc.String(), toolOpenTag, 2)
|
|
before := split[0]
|
|
before = strings.TrimRightFunc(before, unicode.IsSpace)
|
|
if len(before) > 0 {
|
|
events = append(events, qwenEventContent{content: before})
|
|
}
|
|
after := split[1]
|
|
p.acc.Reset()
|
|
p.acc.WriteString(after)
|
|
p.state = qwenParserState_CollectingToolContent
|
|
return events, true
|
|
} else if overlap := overlap(p.acc.String(), toolOpenTag); overlap > 0 {
|
|
// we found a partial tool open tag, so we can emit the unambiguous part,
|
|
// which is the (trailing-whitespace trimmed) content before the partial
|
|
// tool open tag
|
|
beforePartialTag := p.acc.String()[:len(p.acc.String())-overlap]
|
|
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
|
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
|
unambiguous := p.acc.String()[:ambiguousStart]
|
|
ambiguous := p.acc.String()[ambiguousStart:]
|
|
p.acc.Reset()
|
|
p.acc.WriteString(ambiguous)
|
|
events = append(events, qwenEventContent{content: unambiguous})
|
|
return events, false
|
|
} else {
|
|
// we found content that is entirely not a tool call. We should withhold
|
|
// any trailing whitespace in case this is the end of the content
|
|
whitespaceLen := trailingWhitespaceLen(p.acc.String())
|
|
ambiguousStart := len(p.acc.String()) - whitespaceLen
|
|
unambiguous := p.acc.String()[:ambiguousStart]
|
|
ambiguous := p.acc.String()[ambiguousStart:]
|
|
p.acc.Reset()
|
|
p.acc.WriteString(ambiguous)
|
|
if len(unambiguous) > 0 {
|
|
events = append(events, qwenEventContent{content: unambiguous})
|
|
}
|
|
return events, false
|
|
}
|
|
case qwenParserState_CollectingToolContent:
|
|
if strings.Contains(p.acc.String(), toolCloseTag) {
|
|
split := strings.SplitN(p.acc.String(), toolCloseTag, 2)
|
|
before := split[0]
|
|
if len(before) == 0 {
|
|
slog.Warn("qwen tool call closing tag found but no content before it")
|
|
}
|
|
// remove any whitespace between the tool call and any content after it
|
|
after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
|
|
p.acc.Reset()
|
|
p.acc.WriteString(after)
|
|
events = append(events, qwenEventRawToolCall{raw: before})
|
|
p.state = qwenParserState_LookingForToolStart
|
|
return events, true
|
|
} else {
|
|
// note that we don't need to check the overlap here because we only plan
|
|
// on parsing the tool call once we see the full closing tag. We don't
|
|
// stream back the unparsed tool content, so there's no need to be eager
|
|
// here
|
|
return events, false
|
|
}
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|
|
|
|
// overlap returns the length of the longest suffix of s that is also a prefix
// of delim, or 0 if there is no such overlap.
//
// TODO(drifkin): move this to a shared location
func overlap(s, delim string) int {
	// start from the longest candidate and shrink until one matches
	n := min(len(s), len(delim))
	for n > 0 && !strings.HasSuffix(s, delim[:n]) {
		n--
	}
	return n
}
|
|
|
|
// trailingWhitespaceLen returns the number of bytes of whitespace (as defined
// by unicode.IsSpace) at the end of s.
//
// The string is examined rune by rune rather than byte by byte: the trailing
// bytes of a multi-byte UTF-8 character (for example the 0xA0 that ends "à")
// must not be mistaken for whitespace, otherwise callers that slice at the
// returned offset would cut a character in half.
func trailingWhitespaceLen(s string) int {
	return len(s) - len(strings.TrimRightFunc(s, unicode.IsSpace))
}
|
|
|
|
// XMLParameter is the XML decoding target for a single <parameter> element:
// Name comes from its name attribute and Value from its character data.
type XMLParameter struct {
	Name  string `xml:"name,attr"`
	Value string `xml:",chardata"`
}

// XMLFunctionCall is the XML decoding target for a <function> element: Name
// comes from its name attribute and Parameters from its <parameter> children.
type XMLFunctionCall struct {
	XMLName    xml.Name       `xml:"function"`
	Name       string         `xml:"name,attr"`
	Parameters []XMLParameter `xml:"parameter"`
}
|
|
|
|
// parseToolCall parses a raw tool call string into an api.ToolCall.
|
|
// The raw string follows an xml-like format, here's an example:
|
|
//
|
|
// <function=get_current_temperature>
|
|
// <parameter=location>
|
|
// San Francisco
|
|
// </parameter>
|
|
// <parameter=unit>
|
|
// celsius
|
|
// </parameter>
|
|
// </function>
|
|
func parseToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
|
|
toolCall := api.ToolCall{}
|
|
|
|
xmlString := transformToXML(raw.raw)
|
|
|
|
var functionCall XMLFunctionCall
|
|
err := xml.Unmarshal([]byte(xmlString), &functionCall)
|
|
if err != nil {
|
|
return api.ToolCall{}, err
|
|
}
|
|
|
|
toolCall.Function = api.ToolCallFunction{
|
|
Name: functionCall.Name,
|
|
}
|
|
|
|
// Find the matching tool to get parameter types
|
|
var matchedTool *api.Tool
|
|
for i := range tools {
|
|
if tools[i].Function.Name == functionCall.Name {
|
|
matchedTool = &tools[i]
|
|
break
|
|
}
|
|
}
|
|
|
|
toolCall.Function.Arguments = make(api.ToolCallFunctionArguments)
|
|
for _, parameter := range functionCall.Parameters {
|
|
// Look up the parameter type if we found the tool
|
|
var paramType api.PropertyType
|
|
if matchedTool != nil && matchedTool.Function.Parameters.Properties != nil {
|
|
if prop, ok := matchedTool.Function.Parameters.Properties[parameter.Name]; ok {
|
|
paramType = prop.Type
|
|
}
|
|
}
|
|
|
|
toolCall.Function.Arguments[parameter.Name] = parseValue(parameter.Value, paramType)
|
|
}
|
|
|
|
return toolCall, nil
|
|
}
|
|
|
|
// parseValue converts a raw string value to the appropriate type based on the parameter type specification.
|
|
//
|
|
// For union types (multiple types in PropertyType, which we support but doesn't
|
|
// seem as though the reference parser does type coercion with those types in
|
|
// mind) we use a type precedence approach:
|
|
// 1. null - checked first regardless of declared types (matches reference implementation)
|
|
// 2. boolean - only "true"/"false" are valid booleans
|
|
// 3. integer - must parse as a whole number
|
|
// 4. number - must parse as numeric (returns int if no decimal part)
|
|
// 5. array - must parse as valid JSON array
|
|
// 6. object - must parse as valid JSON object
|
|
// 7. string - always succeeds (least specific type)
|
|
//
|
|
// This precedence ensures we return the most specific type that successfully parses,
|
|
// following the principle of least surprise. For example, with PropertyType{"string", "number"},
|
|
// "123" becomes 123 (number), while "hello" becomes "hello" (string).
|
|
func parseValue(raw string, paramType api.PropertyType) any {
|
|
// first remove a single leading newlines, and a single trailing newline (if
|
|
// they exist). This follows the reference implementation
|
|
raw = strings.TrimPrefix(raw, "\n")
|
|
raw = strings.TrimSuffix(raw, "\n")
|
|
|
|
// Check for null first (case-insensitive) - this takes precedence over any type
|
|
if strings.ToLower(raw) == "null" {
|
|
return nil
|
|
}
|
|
|
|
// If no type is specified, default to string
|
|
if len(paramType) == 0 {
|
|
return raw
|
|
}
|
|
|
|
// Check if any of the specified types match, using type precedence
|
|
// Order: boolean -> integer -> number -> array -> object -> string
|
|
typeSet := make(map[string]bool)
|
|
for _, t := range paramType {
|
|
typeSet[t] = true
|
|
}
|
|
|
|
// Try boolean first (most restrictive)
|
|
if typeSet["boolean"] {
|
|
lower := strings.ToLower(raw)
|
|
switch lower {
|
|
case "true":
|
|
return true
|
|
case "false":
|
|
return false
|
|
}
|
|
// If not a valid boolean but boolean is the only type, return false (matching reference)
|
|
if len(paramType) == 1 {
|
|
return false
|
|
}
|
|
// Otherwise try other types
|
|
}
|
|
|
|
// Try integer
|
|
if typeSet["integer"] {
|
|
if i, err := strconv.ParseInt(raw, 10, 64); err == nil {
|
|
// Return as int if it fits in int32, otherwise int64
|
|
if i >= math.MinInt32 && i <= math.MaxInt32 {
|
|
return int(i)
|
|
}
|
|
return i
|
|
}
|
|
// If integer is the only type and parsing failed, fall back to string
|
|
if len(paramType) == 1 {
|
|
return raw
|
|
}
|
|
}
|
|
|
|
// Try number (float)
|
|
if typeSet["number"] {
|
|
if f, err := strconv.ParseFloat(raw, 64); err == nil {
|
|
// If the number has no decimal part, return as int (matching reference)
|
|
if f == math.Trunc(f) {
|
|
i := int64(f)
|
|
if i >= math.MinInt32 && i <= math.MaxInt32 {
|
|
return int(i)
|
|
}
|
|
return i
|
|
}
|
|
return f
|
|
}
|
|
// If number is the only type and parsing failed, fall back to string
|
|
if len(paramType) == 1 {
|
|
return raw
|
|
}
|
|
}
|
|
|
|
// Try array
|
|
if typeSet["array"] {
|
|
var arr []interface{}
|
|
if err := json.Unmarshal([]byte(raw), &arr); err == nil {
|
|
return arr
|
|
}
|
|
// If array is the only type and parsing failed, fall back to string
|
|
if len(paramType) == 1 {
|
|
return raw
|
|
}
|
|
}
|
|
|
|
// Try object
|
|
if typeSet["object"] {
|
|
var obj map[string]interface{}
|
|
if err := json.Unmarshal([]byte(raw), &obj); err == nil {
|
|
return obj
|
|
}
|
|
// If object is the only type and parsing failed, fall back to string
|
|
if len(paramType) == 1 {
|
|
return raw
|
|
}
|
|
}
|
|
|
|
// String always succeeds (or if "string" is in the type set)
|
|
if typeSet["string"] {
|
|
return raw
|
|
}
|
|
|
|
// If we get here, none of the types matched and string wasn't an option
|
|
// We return string as a fallback. The reference implementation will attempt
|
|
// to parse the value as a python literal, but we purposefully don't support
|
|
// that
|
|
return raw
|
|
}
|
|
|
|
var (
	// qwenTagRegex matches the model's `<function=NAME>` and
	// `<parameter=NAME>` opening tags, capturing the tag kind (group 1) and
	// the name (group 2). It is deliberately limited to these two tags so that
	// other text of the form `<word=value>` inside a parameter value is left
	// exactly as the model wrote it.
	qwenTagRegex = regexp.MustCompile(`<(function|parameter)=([^>]+)>`)

	// qwenXMLTagRegex matches opening and closing function/parameter tags in
	// their xml form, with an optional name attribute (for example
	// `<parameter name="x">` or `</function>`).
	qwenXMLTagRegex = regexp.MustCompile(`</?(?:function|parameter)(?:\s+name="[^"]*")?>`)
)
|
|
|
|
// transformToXML transforms a raw qwen tool call with xml-like tags into valid
|
|
// xml so that it can be parsed by any xml parser
|
|
func transformToXML(raw string) string {
|
|
// take the form `<tag=abc>` and transform it to `<tag name="abc">`, taking
|
|
// care to properly escape the string that becomes the attribute value
|
|
transformed := qwenTagRegex.ReplaceAllStringFunc(raw, func(match string) string {
|
|
groups := qwenTagRegex.FindStringSubmatch(match)
|
|
tag := groups[1]
|
|
var escapedValue strings.Builder
|
|
xml.EscapeText(&escapedValue, []byte(groups[2]))
|
|
return fmt.Sprintf(`<%s name="%s">`, tag, escapedValue.String())
|
|
})
|
|
|
|
// Walk the resulting string, escaping any character data that sits between the
|
|
// xml tags we just emitted
|
|
var out strings.Builder
|
|
lastIdx := 0
|
|
for _, loc := range qwenXMLTagRegex.FindAllStringIndex(transformed, -1) {
|
|
if loc[0] > lastIdx {
|
|
escapeTextNode(&out, transformed[lastIdx:loc[0]])
|
|
}
|
|
out.WriteString(transformed[loc[0]:loc[1]])
|
|
lastIdx = loc[1]
|
|
}
|
|
if lastIdx < len(transformed) {
|
|
escapeTextNode(&out, transformed[lastIdx:])
|
|
}
|
|
|
|
return out.String()
|
|
}
|
|
|
|
// escapeTextNode appends s to sb as XML character data: `&`, `<` and `>` are
// replaced by the entity references `&amp;`, `&lt;` and `&gt;`, and every
// other character is written as is. Newlines and tabs in particular are left
// untouched, which is why xml.EscapeText (which rewrites them as numeric
// character references) isn't used for this.
func escapeTextNode(sb *strings.Builder, s string) {
	for _, r := range s {
		switch r {
		case '&':
			sb.WriteString("&amp;")
		case '<':
			sb.WriteString("&lt;")
		case '>':
			sb.WriteString("&gt;")
		default:
			sb.WriteRune(r)
		}
	}
}
|