routes: structured outputs for gpt-oss (#12460)
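In short: when a thinking-capable model is paired with a format constraint, constraining from the first token would mangle the thinking tokens. The change below withholds the format while the model is thinking, then uses the first parsed non-thinking content as the signal to cancel the in-flight completion and reissue it with the constraint applied and the accumulated thinking appended to the prompt. For gpt-oss (harmony), the re-rendered prompt is additionally prefilled to force the final channel.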
@@ -1979,14 +1979,42 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		toolParser = tools.NewParser(m.Template.Template, req.Tools)
 	}
 
+	type structuredOutputsState int
+	const (
+		structuredOutputsState_None structuredOutputsState = iota
+		structuredOutputsState_ReadyToApply
+		structuredOutputsState_Applying
+	)
+
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
 
-		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
+		structuredOutputsState := structuredOutputsState_None
+
+		for {
+			var tb strings.Builder
+
+			currentFormat := req.Format
+			// structured outputs via double request is enabled when:
+			// 1. the model supports the thinking capability and
+			// 2. it uses a built-in parser or our generic thinking parser
+
+			// Note that the current approach does not work for (potential future)
+			// non-thinking models that emit anything before actual content. This
+			// approach uses the transition from parsed thinking content to
+			// parsed non-thinking content as the signal to turn constraining on.
+			if req.Format != nil && structuredOutputsState == structuredOutputsState_None && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) {
+				currentFormat = nil
+			}
+
+			// set up a new context from the request's parent context for each attempt
+			ctx, cancel := context.WithCancel(c.Request.Context())
+			err := r.Completion(ctx, llm.CompletionRequest{
 				Prompt:  prompt,
 				Images:  images,
-				Format:  req.Format,
+				Format:  currentFormat,
 				Options: opts,
 			}, func(r llm.CompletionResponse) {
 				res := api.ChatResponse{
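The three states above drive a retry loop around the completion call. As a rough, self-contained sketch of that control flow (everything except the state constants is a hypothetical stand-in for the server plumbing, and chunk stands in for a parsed llm.CompletionResponse):

	package main

	import "fmt"

	type structuredOutputsState int

	const (
		structuredOutputsState_None structuredOutputsState = iota
		structuredOutputsState_ReadyToApply
		structuredOutputsState_Applying
	)

	// chunk is a hypothetical stand-in for a parsed llm.CompletionResponse.
	type chunk struct{ thinking, content string }

	func main() {
		// pass 1 emits thinking and then content (triggering a restart);
		// pass 2 is the constrained retry.
		passes := [][]chunk{
			{{thinking: "reasoning..."}, {content: `{"answer"`}},
			{{content: `{"answer":"42"}`}},
		}

		format := `{"type":"object"}` // the request's format
		state := structuredOutputsState_None

		for pass := 0; ; pass++ {
			currentFormat := format
			if state == structuredOutputsState_None {
				currentFormat = "" // withhold constraining while the model thinks
			}
			fmt.Printf("pass %d: format=%q\n", pass+1, currentFormat)

			restart := false
			for _, c := range passes[pass] {
				if state == structuredOutputsState_None && c.content != "" {
					// first non-thinking content: cancel and retry constrained
					state = structuredOutputsState_ReadyToApply
					restart = true
					break
				}
				fmt.Printf("  emit thinking=%q content=%q\n", c.thinking, c.content)
			}

			if restart {
				state = structuredOutputsState_Applying
				continue // re-render the prompt with the accumulated thinking
			}
			break
		}
	}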
@@ -2020,13 +2048,20 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.Message.Thinking = thinking
 				res.Message.ToolCalls = toolCalls
 
+				tb.WriteString(thinking)
+				// we are now receiving content from the model - we should start applying structured outputs
+				if structuredOutputsState == structuredOutputsState_None && req.Format != nil && tb.String() != "" && res.Message.Content != "" {
+					structuredOutputsState = structuredOutputsState_ReadyToApply
+					cancel()
+					return
+				}
+
 				if res.Message.Content != "" || res.Message.Thinking != "" || len(res.Message.ToolCalls) > 0 || r.Done {
 					slog.Log(context.TODO(), logutil.LevelTrace, "builtin parser output", "parser", m.Config.Parser, "content", content, "thinking", thinking, "toolCalls", toolCalls, "done", r.Done)
 					ch <- res
 				} else {
 					slog.Log(context.TODO(), logutil.LevelTrace, "builtin parser empty output", "parser", m.Config.Parser)
 				}
 
 				return
 			}
 
@@ -2036,8 +2071,18 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				// need to accumulate more to decide what to send
 				return
 			}
-			res.Message.Content = remainingContent
 			res.Message.Thinking = thinkingContent
+			tb.WriteString(thinkingContent)
+			// emit the collected thinking text before restarting with structured outputs and clear unstructured content
+			// to avoid leaking mixed tokens like "</think>Hello"
+			if structuredOutputsState == structuredOutputsState_None && req.Format != nil && tb.String() != "" && remainingContent != "" {
+				structuredOutputsState = structuredOutputsState_ReadyToApply
+				res.Message.Content = ""
+				ch <- res
+				cancel()
+				return
+			}
+			res.Message.Content = remainingContent
 		}
 
 		if len(req.Tools) > 0 {
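Note the ordering in the generic thinking path just above: the thinking-only event is flushed to the channel with Content cleared before cancel() runs, so the client never sees a mixed token boundary like "</think>Hello". The streaming test below asserts exactly this shape: a first event with thinking and empty content, then a constrained final event.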
@@ -2059,8 +2104,42 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				}
 
 				ch <- res
-		}); err != nil {
+			})
+			if err != nil {
+				if structuredOutputsState == structuredOutputsState_ReadyToApply && strings.Contains(err.Error(), "context canceled") && c.Request.Context().Err() == nil {
+					// only ignore the error if it's a context cancellation due to setting structured outputs
+				} else {
 					ch <- gin.H{"error": err.Error()}
+					return
+				}
+			}
+
+			// an ignored structured outputs cancellation falls through to here; start a new request with the structured outputs and updated prompt
+			if structuredOutputsState == structuredOutputsState_ReadyToApply {
+				structuredOutputsState = structuredOutputsState_Applying
+				msg := api.Message{
+					Role:     "assistant",
+					Thinking: tb.String(),
+				}
+
+				msgs = append(msgs, msg)
+				prompt, _, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
+				if err != nil {
+					slog.Error("chat prompt error applying structured outputs", "error", err)
+					ch <- gin.H{"error": err.Error()}
+					return
+				}
+				// force constraining by terminating the thinking header; the parser is already in this state.
+				// when the last message is thinking, the renderer for gpt-oss cannot disambiguate between having the
+				// model continue thinking or ending thinking and outputting the final message.
+				// TODO(parthsareen): consider adding prefill disambiguation logic to the renderer for structured outputs.
+				if shouldUseHarmony(m) || (builtinParser != nil && m.Config.Parser == "harmony") {
+					prompt += "<|end|><|start|>assistant<|channel|>final<|message|>"
+				}
+				continue
+			}
+
+			break
 		}
 	}()
 
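For a sense of what the forced prefill does for gpt-oss: assuming a typical harmony rendering in which the trailing assistant thinking message is left open, the re-rendered prompt would end roughly like this (illustrative only, not the renderer's exact output):

	<|start|>assistant<|channel|>analysis<|message|>...accumulated thinking...
	<|end|><|start|>assistant<|channel|>final<|message|>

The appended string closes the open analysis (thinking) message and opens the final channel, so decoding resumes directly on final-channel tokens under the requested format.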
@@ -1191,4 +1191,238 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 			t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
 		}
 	})
+
+	t.Run("structured outputs restart non-stream", func(t *testing.T) {
+		var (
+			requestsMu sync.Mutex
+			requests   []llm.CompletionRequest
+			wg         sync.WaitGroup
+		)
+
+		wg.Add(2)
+
+		format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}}}`)
+
+		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
+			defer wg.Done()
+
+			requestsMu.Lock()
+			requests = append(requests, r)
+			callNum := len(requests)
+			requestsMu.Unlock()
+
+			switch callNum {
+			case 1:
+				fn(llm.CompletionResponse{
+					Content:            " I am thinking through this problem. </think> {\"answer\":\"42\"}",
+					Done:               false,
+					PromptEvalCount:    1,
+					PromptEvalDuration: 1,
+				})
+
+				select {
+				case <-ctx.Done():
+					return ctx.Err()
+				case <-time.After(time.Second):
+					t.Fatalf("timeout waiting for structured outputs cancellation")
+					return nil
+				}
+			case 2:
+				fn(llm.CompletionResponse{
+					Content:            `{"answer":"42"}`,
+					Done:               true,
+					DoneReason:         llm.DoneReasonStop,
+					PromptEvalCount:    1,
+					PromptEvalDuration: 1,
+					EvalCount:          1,
+					EvalDuration:       1,
+				})
+				return nil
+			default:
+				t.Fatalf("unexpected number of completion calls: %d", callNum)
+				return nil
+			}
+		}
+
+		think := true
+		streamRequest := false
+		w := createRequest(t, s.ChatHandler, api.ChatRequest{
+			Model:    "test-thinking",
+			Messages: []api.Message{{Role: "user", Content: "Please respond in JSON."}},
+			Think:    &api.ThinkValue{Value: think},
+			Stream:   &streamRequest,
+			Format:   format,
+		})
+
+		wg.Wait()
+		mock.CompletionFn = nil
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status 200, got %d", w.Code)
+		}
+
+		if len(requests) != 2 {
+			t.Fatalf("expected two completion calls, got %d", len(requests))
+		}
+
+		if requests[0].Format != nil {
+			t.Errorf("expected first completion format to be nil, got %q", requests[0].Format)
+		}
+
+		if !bytes.Equal([]byte(format), []byte(requests[1].Format)) {
+			t.Errorf("expected second completion format to match original format")
+		}
+
+		var resp api.ChatResponse
+		if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+			t.Fatal(err)
+		}
+
+		if resp.Message.Thinking != "I am thinking through this problem. " {
+			t.Errorf("expected thinking %q, got %q", "I am thinking through this problem. ", resp.Message.Thinking)
+		}
+
+		if resp.Message.Content != `{"answer":"42"}` {
+			t.Errorf("expected content %q, got %q", `{"answer":"42"}`, resp.Message.Content)
+		}
+
+		if !resp.Done {
+			t.Errorf("expected response to be done")
+		}
+
+		if resp.DoneReason != "stop" {
+			t.Errorf("expected done reason stop, got %s", resp.DoneReason)
+		}
+	})
+
+	t.Run("structured outputs restart streaming", func(t *testing.T) {
+		var (
+			requestsMu sync.Mutex
+			requests   []llm.CompletionRequest
+			wg         sync.WaitGroup
+		)
+
+		wg.Add(2)
+
+		format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}}}`)
+
+		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
+			defer wg.Done()
+
+			requestsMu.Lock()
+			requests = append(requests, r)
+			callNum := len(requests)
+			requestsMu.Unlock()
+
+			switch callNum {
+			case 1:
+				fn(llm.CompletionResponse{
+					Content:            " I am thinking through this problem. </think> {\"answer\":\"42\"}",
+					Done:               false,
+					PromptEvalCount:    1,
+					PromptEvalDuration: 1,
+				})
+
+				select {
+				case <-ctx.Done():
+					return ctx.Err()
+				case <-time.After(time.Second):
+					t.Fatalf("timeout waiting for structured outputs cancellation")
+					return nil
+				}
+			case 2:
+				fn(llm.CompletionResponse{
+					Content:            `{"answer":"42"}`,
+					Done:               true,
+					DoneReason:         llm.DoneReasonStop,
+					PromptEvalCount:    1,
+					PromptEvalDuration: 1,
+					EvalCount:          1,
+					EvalDuration:       1,
+				})
+				return nil
+			default:
+				t.Fatalf("unexpected number of completion calls: %d", callNum)
+				return nil
+			}
+		}
+
+		think := true
+		streamRequest := true
+		w := createRequest(t, s.ChatHandler, api.ChatRequest{
+			Model:    "test-thinking",
+			Messages: []api.Message{{Role: "user", Content: "Please respond in JSON."}},
+			Think:    &api.ThinkValue{Value: think},
+			Stream:   &streamRequest,
+			Format:   format,
+		})
+
+		wg.Wait()
+		mock.CompletionFn = nil
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status 200, got %d", w.Code)
+		}
+
+		if len(requests) != 2 {
+			t.Fatalf("expected two completion calls, got %d", len(requests))
+		}
+
+		if requests[0].Format != nil {
+			t.Errorf("expected first completion format to be nil, got %q", requests[0].Format)
+		}
+
+		if !bytes.Equal([]byte(format), []byte(requests[1].Format)) {
+			t.Errorf("expected second completion format to match original format")
+		}
+
+		decoder := json.NewDecoder(w.Body)
+		var events []api.ChatResponse
+		for {
+			var event api.ChatResponse
+			if err := decoder.Decode(&event); err == io.EOF {
+				break
+			} else if err != nil {
+				t.Fatal(err)
+			}
+			events = append(events, event)
+			if event.Done {
+				break
+			}
+		}
+
+		if len(events) < 2 {
+			t.Fatalf("expected at least two streaming events, got %d", len(events))
+		}
+
+		first := events[0]
+		if first.Message.Thinking != "I am thinking through this problem. " {
+			t.Errorf("expected first event thinking %q, got %q", "I am thinking through this problem. ", first.Message.Thinking)
+		}
+
+		if first.Message.Content != "" {
+			t.Errorf("expected first event content to be empty, got %q", first.Message.Content)
+		}
+
+		if first.Done {
+			t.Error("expected first event to be non-terminal")
+		}
+
+		last := events[len(events)-1]
+		if last.Message.Thinking != "" {
+			t.Errorf("expected final event thinking to be empty, got %q", last.Message.Thinking)
+		}
+
+		if last.Message.Content != `{"answer":"42"}` {
+			t.Errorf("expected final event content %q, got %q", `{"answer":"42"}`, last.Message.Content)
+		}
+
+		if !last.Done {
+			t.Error("expected final event to be done")
+		}
+
+		if last.DoneReason != "stop" {
+			t.Errorf("expected final done reason stop, got %s", last.DoneReason)
+		}
+	})
 }
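For reference, a client request that exercises the new path (a thinking model plus a format constraint) might look like the sketch below. It reuses the api types that appear in the tests above; ClientFromEnvironment and Chat come from the github.com/ollama/ollama/api client package, and the model name is a placeholder.

	package main

	import (
		"context"
		"encoding/json"
		"fmt"
		"log"

		"github.com/ollama/ollama/api"
	)

	func main() {
		client, err := api.ClientFromEnvironment()
		if err != nil {
			log.Fatal(err)
		}

		stream := false
		req := &api.ChatRequest{
			Model:    "gpt-oss", // placeholder: any thinking-capable model
			Messages: []api.Message{{Role: "user", Content: "Please respond in JSON."}},
			Think:    &api.ThinkValue{Value: true},
			Stream:   &stream,
			Format:   json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}}}`),
		}

		err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
			// Thinking arrives unconstrained; Content is constrained to the schema.
			fmt.Printf("thinking: %s\ncontent: %s\n", resp.Message.Thinking, resp.Message.Content)
			return nil
		})
		if err != nil {
			log.Fatal(err)
		}
	}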