From 4be4dc8717589c859734371a727bb7bc911c7ed8 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Fri, 22 Aug 2025 12:00:16 -0700
Subject: [PATCH] server: skip parsing initial <think> if provided in the
 prompt (#12024)

---
 server/routes.go               |   4 +
 server/routes_generate_test.go | 230 +++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+)

diff --git a/server/routes.go b/server/routes.go
index cc8913537e..ae1662148c 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1673,6 +1673,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
 		}
+
+		if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
+			thinkingState.AddContent(openingTag)
+		}
 	}
 
 	var toolParser *tools.Parser
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index a57975f16b..a3b83fc1a9 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -969,3 +969,233 @@ func TestGenerate(t *testing.T) {
 		}
 	})
 }
+
+func TestChatWithPromptEndingInThinkTag(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	// Helper to create a standard thinking test setup
+	setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
+		mock := &mockRunner{
+			CompletionResponse: llm.CompletionResponse{
+				Done:               true,
+				DoneReason:         llm.DoneReasonStop,
+				PromptEvalCount:    1,
+				PromptEvalDuration: 1,
+				EvalCount:          1,
+				EvalDuration:       1,
+			},
+		}
+
+		s := &Server{
+			sched: &Scheduler{
+				pendingReqCh:  make(chan *LlmRequest, 1),
+				finishedReqCh: make(chan *LlmRequest, 1),
+				expiredCh:     make(chan *runnerRef, 1),
+				unloadedCh:    make(chan any, 1),
+				loaded:        make(map[string]*runnerRef),
+				newServerFn:   newMockServer(mock),
+				getGpuFn:      discover.GetGPUInfo,
+				getCpuFn:      discover.GetCPUInfo,
+				reschedDelay:  250 * time.Millisecond,
+				loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+					time.Sleep(time.Millisecond)
+					req.successCh <- &runnerRef{llama: mock}
+					return false
+				},
+			},
+		}
+
+		go s.sched.Run(t.Context())
+
+		// Create a model with thinking support
+		_, digest := createBinFile(t, ggml.KV{
+			"general.architecture":          "llama",
+			"llama.block_count":             uint32(1),
+			"llama.context_length":          uint32(8192),
+			"llama.embedding_length":        uint32(4096),
+			"llama.attention.head_count":    uint32(32),
+			"llama.attention.head_count_kv": uint32(8),
+			"tokenizer.ggml.tokens":         []string{""},
+			"tokenizer.ggml.scores":         []float32{0},
+			"tokenizer.ggml.token_type":     []int32{0},
+		}, []*ggml.Tensor{
+			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+			{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		})
+
+		// Create model with thinking template that adds <think> at the end
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+			Model: "test-thinking",
+			Files: map[string]string{"file.gguf": digest},
+			Template: `{{- range .Messages }}
+{{- if eq .Role "user" }}user: {{ .Content }}
+{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
+{{ end }}{{ end }}<think>`,
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status 200, got %d", w.Code)
+		}
+
+		return mock, s
+	}
+
+	mock, s := setupThinkingTest(t)
+
+	// Helper to test chat responses
+	testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
+		t.Run(name, func(t *testing.T) {
+			mock.CompletionResponse = llm.CompletionResponse{
+				Content:            modelResponse,
+				Done:               true,
+				DoneReason:         llm.DoneReasonStop,
+				PromptEvalCount:    1,
+				PromptEvalDuration: 1,
+				EvalCount:          1,
+				EvalDuration:       1,
+			}
+			mock.CompletionFn = nil
+
+			streamRequest := false
+			req := api.ChatRequest{
+				Model: "test-thinking",
+				Messages: []api.Message{
+					{Role: "user", Content: userContent},
+				},
+				Stream: &streamRequest,
+			}
+			if think {
+				req.Think = &api.ThinkValue{Value: think}
+			}
+
+			w := createRequest(t, s.ChatHandler, req)
+			if w.Code != http.StatusOK {
+				t.Fatalf("expected status 200, got %d", w.Code)
+			}
+
+			var resp api.ChatResponse
+			if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+				t.Fatal(err)
+			}
+
+			if resp.Message.Thinking != expectedThinking {
+				t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
+			}
+
+			if resp.Message.Content != expectedContent {
+				t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
+			}
+		})
+	}
+
+	// Test cases - Note: Template adds <think> at the end, and leading whitespace after </think> is eaten by the parser
+	testChatRequest(t, "basic thinking response",
+		"Help me solve this problem",
+		" Let me think about this step by step... </think> The answer is 42.",
+		"Let me think about this step by step... ",
+		"The answer is 42.",
+		true)
+
+	testChatRequest(t, "thinking with multiple sentences",
+		"Explain quantum computing",
+		" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
+		"First, I need to understand the basics. Quantum bits can be in superposition. ",
", + "Quantum computing uses quantum mechanics principles.", + true) + + testChatRequest(t, "no thinking content", + "What is 2+2?", + " The answer is 4.", + "", + "The answer is 4.", + true) + + testChatRequest(t, "thinking disabled but template still adds think tag", + "Simple question", + " My thoughts The answer.", + "", + " My thoughts The answer.", + false) + + // Test streaming response with template-added + t.Run("streaming with thinking", func(t *testing.T) { + var wg sync.WaitGroup + wg.Add(1) + + mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { + defer wg.Done() + + // Verify the prompt ends with due to template + if !strings.HasSuffix(r.Prompt, "") { + t.Errorf("expected prompt to end with , got: %q", r.Prompt) + } + + // Simulate streaming chunks + responses := []llm.CompletionResponse{ + {Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1}, + {Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1}, + {Content: " Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1}, + {Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1}, + } + + for _, resp := range responses { + select { + case <-ctx.Done(): + return ctx.Err() + default: + fn(resp) + time.Sleep(10 * time.Millisecond) + } + } + return nil + } + + think := true + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-thinking", + Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}}, + Think: &api.ThinkValue{Value: think}, + Stream: &stream, + }) + + wg.Wait() + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + // Parse streaming responses + decoder := json.NewDecoder(w.Body) + var allThinking, allContent strings.Builder + + for { + var resp api.ChatResponse + if err := decoder.Decode(&resp); err == io.EOF { + break + } else if err != nil { + t.Fatal(err) + } + allThinking.WriteString(resp.Message.Thinking) + allContent.WriteString(resp.Message.Content) + } + + // Note: Leading whitespace after is eaten by the parser + if got := allThinking.String(); got != "I need to consider multiple factors here... " { + t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got) + } + + if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." { + t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got) + } + }) +}