routes: structured outputs for gpt-oss (#12460)

Parth Sareen
2025-10-08 19:13:38 -07:00
committed by GitHub
parent 1b91d4dda1
commit 77060d462c
2 changed files with 377 additions and 64 deletions

@@ -1191,4 +1191,238 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
}
})
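	// With thinking enabled and a structured-output format, the server is
	// expected to run an initial completion without the format, then cancel
	// it once thinking ends and re-issue the request with the format applied.
	// This subtest asserts that two-call sequence for a non-streaming request.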
t.Run("structured outputs restart non-stream", func(t *testing.T) {
var (
requestsMu sync.Mutex
requests []llm.CompletionRequest
wg sync.WaitGroup
)
wg.Add(2)
format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}}}`)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
requestsMu.Lock()
requests = append(requests, r)
callNum := len(requests)
requestsMu.Unlock()
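			// Call 1 emits thinking that spills past </think> into JSON
			// content; call 2 is the restarted, format-constrained completion.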
			switch callNum {
			case 1:
				fn(llm.CompletionResponse{
					Content:            " I am thinking through this problem. </think> {\"answer\":\"42\"}",
					Done:               false,
					PromptEvalCount:    1,
					PromptEvalDuration: 1,
				})
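				// Block until the server cancels this in-flight request to
				// restart it with the format; fail rather than hang forever.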
				select {
				case <-ctx.Done():
					return ctx.Err()
				case <-time.After(time.Second):
					t.Fatalf("timeout waiting for structured outputs cancellation")
					return nil
				}
			case 2:
				fn(llm.CompletionResponse{
					Content:            `{"answer":"42"}`,
					Done:               true,
					DoneReason:         llm.DoneReasonStop,
					PromptEvalCount:    1,
					PromptEvalDuration: 1,
					EvalCount:          1,
					EvalDuration:       1,
				})
				return nil
			default:
				t.Fatalf("unexpected number of completion calls: %d", callNum)
				return nil
			}
		}
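		// Non-streaming chat request with thinking enabled and a JSON schema format.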
		think := true
		streamRequest := false
		w := createRequest(t, s.ChatHandler, api.ChatRequest{
			Model:    "test-thinking",
			Messages: []api.Message{{Role: "user", Content: "Please respond in JSON."}},
			Think:    &api.ThinkValue{Value: think},
			Stream:   &streamRequest,
			Format:   format,
		})
		wg.Wait()
		mock.CompletionFn = nil
		if w.Code != http.StatusOK {
			t.Fatalf("expected status 200, got %d", w.Code)
		}
		if len(requests) != 2 {
			t.Fatalf("expected two completion calls, got %d", len(requests))
		}
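		// The thinking pass must run unconstrained; only the restarted call
		// should carry the original format.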
		if requests[0].Format != nil {
			t.Errorf("expected first completion format to be nil, got %q", requests[0].Format)
		}
		if !bytes.Equal([]byte(format), []byte(requests[1].Format)) {
			t.Errorf("expected second completion format to match original format")
		}
		var resp api.ChatResponse
		if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
			t.Fatal(err)
		}
		if resp.Message.Thinking != "I am thinking through this problem. " {
			t.Errorf("expected thinking %q, got %q", "I am thinking through this problem. ", resp.Message.Thinking)
		}
		if resp.Message.Content != `{"answer":"42"}` {
			t.Errorf("expected content %q, got %q", `{"answer":"42"}`, resp.Message.Content)
		}
		if !resp.Done {
			t.Errorf("expected response to be done")
		}
		if resp.DoneReason != "stop" {
			t.Errorf("expected done reason stop, got %s", resp.DoneReason)
		}
	})
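	// Same restart behavior, but over a streaming response: thinking tokens
	// stream first, and the restarted completion's JSON arrives afterwards.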
t.Run("structured outputs restart streaming", func(t *testing.T) {
var (
requestsMu sync.Mutex
requests []llm.CompletionRequest
wg sync.WaitGroup
)
wg.Add(2)
format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}}}`)
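		// The mock mirrors the non-stream case: the first call blocks until
		// cancelled, the second returns the final constrained JSON.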
		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
			defer wg.Done()
			requestsMu.Lock()
			requests = append(requests, r)
			callNum := len(requests)
			requestsMu.Unlock()
			switch callNum {
			case 1:
				fn(llm.CompletionResponse{
					Content:            " I am thinking through this problem. </think> {\"answer\":\"42\"}",
					Done:               false,
					PromptEvalCount:    1,
					PromptEvalDuration: 1,
				})
				select {
				case <-ctx.Done():
					return ctx.Err()
				case <-time.After(time.Second):
					t.Fatalf("timeout waiting for structured outputs cancellation")
					return nil
				}
			case 2:
				fn(llm.CompletionResponse{
					Content:            `{"answer":"42"}`,
					Done:               true,
					DoneReason:         llm.DoneReasonStop,
					PromptEvalCount:    1,
					PromptEvalDuration: 1,
					EvalCount:          1,
					EvalDuration:       1,
				})
				return nil
			default:
				t.Fatalf("unexpected number of completion calls: %d", callNum)
				return nil
			}
		}
		think := true
		streamRequest := true
		w := createRequest(t, s.ChatHandler, api.ChatRequest{
			Model:    "test-thinking",
			Messages: []api.Message{{Role: "user", Content: "Please respond in JSON."}},
			Think:    &api.ThinkValue{Value: think},
			Stream:   &streamRequest,
			Format:   format,
		})
		wg.Wait()
		mock.CompletionFn = nil
		if w.Code != http.StatusOK {
			t.Fatalf("expected status 200, got %d", w.Code)
		}
		if len(requests) != 2 {
			t.Fatalf("expected two completion calls, got %d", len(requests))
		}
		if requests[0].Format != nil {
			t.Errorf("expected first completion format to be nil, got %q", requests[0].Format)
		}
		if !bytes.Equal([]byte(format), []byte(requests[1].Format)) {
			t.Errorf("expected second completion format to match original format")
		}
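		// Drain the newline-delimited JSON stream until the terminal event.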
		decoder := json.NewDecoder(w.Body)
		var events []api.ChatResponse
		for {
			var event api.ChatResponse
			if err := decoder.Decode(&event); err == io.EOF {
				break
			} else if err != nil {
				t.Fatal(err)
			}
			events = append(events, event)
			if event.Done {
				break
			}
		}
		if len(events) < 2 {
			t.Fatalf("expected at least two streaming events, got %d", len(events))
		}
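		// Thinking should surface in the first event with no content; the
		// final event should carry only the restarted completion's JSON.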
		first := events[0]
		if first.Message.Thinking != "I am thinking through this problem. " {
			t.Errorf("expected first event thinking %q, got %q", "I am thinking through this problem. ", first.Message.Thinking)
		}
		if first.Message.Content != "" {
			t.Errorf("expected first event content to be empty, got %q", first.Message.Content)
		}
		if first.Done {
			t.Error("expected first event to be non-terminal")
		}
		last := events[len(events)-1]
		if last.Message.Thinking != "" {
			t.Errorf("expected final event thinking to be empty, got %q", last.Message.Thinking)
		}
		if last.Message.Content != `{"answer":"42"}` {
			t.Errorf("expected final event content %q, got %q", `{"answer":"42"}`, last.Message.Content)
		}
		if !last.Done {
			t.Error("expected final event to be done")
		}
		if last.DoneReason != "stop" {
			t.Errorf("expected final done reason stop, got %s", last.DoneReason)
		}
	})
}