Revert "runner: move harmony to runner (#12052)"

This reverts commit 1a558f98e2.
jmorganca authored 2025-09-12 13:32:30 -07:00; committed by Jeffrey Morgan
parent 9d56e63dbf
commit 92b96d54ef
6 changed files with 337 additions and 369 deletions


@@ -1348,9 +1348,7 @@ type CompletionRequest struct {
 	Images  []ImageData
 	Options *api.Options
-	Grammar       string // set before sending the request to the subprocess
-	UseHarmony    bool
-	PrefillString string
+	Grammar string // set before sending the request to the subprocess
 }
 // DoneReason represents the reason why a completion response is done
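A side note on the first hunk: once it is applied, Grammar is the only remaining request-level field that is set before sending to the subprocess; UseHarmony and PrefillString are gone. Below is a minimal sketch of the post-revert request shape, using simplified stand-in types rather than the real llm and api packages (the grammar value is hypothetical):

package main

import "fmt"

// Simplified stand-ins so this sketch compiles on its own; the real request
// references api.Options and the llm package's ImageData instead.
type Options struct{ NumPredict int }

type CompletionRequest struct {
	Images  [][]byte
	Options *Options
	Grammar string // set before sending the request to the subprocess
}

func main() {
	// After the revert there is no UseHarmony or PrefillString to populate;
	// Grammar is the only remaining extra knob on the request.
	req := CompletionRequest{
		Options: &Options{NumPredict: 128},
		Grammar: `root ::= "yes" | "no"`, // hypothetical grammar string
	}
	fmt.Printf("%+v\n", req)
}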
@@ -1363,8 +1361,6 @@ const (
 	DoneReasonLength
 	// DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
 	DoneReasonConnectionClosed
-	// DoneReasonTokenRepeatLimit indicates the completion stopped due to a token repeat limit
-	DoneReasonTokenRepeatLimit
 )
 func (d DoneReason) String() string {
@@ -1373,23 +1369,19 @@ func (d DoneReason) String() string {
 		return "length"
 	case DoneReasonStop:
 		return "stop"
-	case DoneReasonTokenRepeatLimit:
-		return "token_repeat_limit"
 	default:
 		return "" // closed
 	}
 }
 type CompletionResponse struct {
-	Content            string         `json:"content"`
-	Thinking           string         `json:"thinking"`
-	ToolCalls          []api.ToolCall `json:"tool_calls"`
-	DoneReason         DoneReason     `json:"done_reason"`
-	Done               bool           `json:"done"`
-	PromptEvalCount    int            `json:"prompt_eval_count"`
-	PromptEvalDuration time.Duration  `json:"prompt_eval_duration"`
-	EvalCount          int            `json:"eval_count"`
-	EvalDuration       time.Duration  `json:"eval_duration"`
+	Content            string        `json:"content"`
+	DoneReason         DoneReason    `json:"done_reason"`
+	Done               bool          `json:"done"`
+	PromptEvalCount    int           `json:"prompt_eval_count"`
+	PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
+	EvalCount          int           `json:"eval_count"`
+	EvalDuration       time.Duration `json:"eval_duration"`
 }
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
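To make the struct change concrete: after this revert a CompletionResponse carries only content, the done flag, and the eval counters; Thinking and ToolCalls are no longer part of the streamed payload, so a callback passed as fn can only ever observe those remaining fields. The sketch below uses a trimmed stand-in for the struct, not the actual llm package:

package main

import (
	"fmt"
	"time"
)

// Trimmed stand-in mirroring the restored CompletionResponse from the hunk above.
type CompletionResponse struct {
	Content            string
	Done               bool
	PromptEvalCount    int
	PromptEvalDuration time.Duration
	EvalCount          int
	EvalDuration       time.Duration
}

func main() {
	// A callback in the style of fn(CompletionResponse): it only ever sees
	// plain content or the final stats, never thinking or tool-call fields.
	fn := func(r CompletionResponse) {
		if r.Done {
			fmt.Printf("done: evaluated %d tokens in %s\n", r.EvalCount, r.EvalDuration)
			return
		}
		fmt.Print(r.Content)
	}

	fn(CompletionResponse{Content: "Hello, "})
	fn(CompletionResponse{Content: "world."})
	fn(CompletionResponse{Done: true, EvalCount: 2, EvalDuration: 40 * time.Millisecond})
}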
@@ -1507,8 +1499,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 				return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
 			}
 			switch {
-			// TODO(parthsareen): token repeat limit is now handled in the runner, this currently support legacy model and can be removed in the future
-			case strings.TrimSpace(c.Content) == lastToken && c.Content != "":
+			case strings.TrimSpace(c.Content) == lastToken:
 				tokenRepeat++
 			default:
 				lastToken = strings.TrimSpace(c.Content)
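The hunk above restores the pre-harmony repeat check, which compares the trimmed content against the previous token without the c.Content != "" guard, so empty and whitespace-only chunks count toward the repeat counter again. A self-contained sketch of that counter follows; the repeatAborter helper is hypothetical and the limit is illustrative rather than the value used upstream:

package main

import (
	"fmt"
	"strings"
)

// repeatAborter mirrors the restored switch: it counts consecutive chunks
// whose trimmed content equals the previous one and reports when the limit
// is exceeded.
type repeatAborter struct {
	lastToken   string
	tokenRepeat int
	limit       int
}

func (r *repeatAborter) observe(content string) bool {
	switch {
	case strings.TrimSpace(content) == r.lastToken:
		// With the c.Content != "" guard reverted away, empty and
		// whitespace-only chunks count toward the repeat as well.
		r.tokenRepeat++
	default:
		r.lastToken = strings.TrimSpace(content)
		r.tokenRepeat = 0
	}
	return r.tokenRepeat > r.limit
}

func main() {
	a := &repeatAborter{limit: 3} // illustrative limit
	for _, chunk := range []string{"a", "b", "b", "b", "b", "b"} {
		if a.observe(chunk) {
			fmt.Println("prediction aborted, token repeat limit reached")
			return
		}
	}
}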
@@ -1521,14 +1512,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 				return ctx.Err()
 			}
+			if c.Content != "" {
+				fn(CompletionResponse{
+					Content: c.Content,
+				})
+			}
 			if c.Done {
 				fn(c)
 				return nil
 			}
-			if c.Content != "" || c.Thinking != "" || len(c.ToolCalls) > 0 {
-				fn(c)
-			}
 		}
 	}
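Taken together, the restored loop forwards plain content as a minimal CompletionResponse and passes the final done event through unchanged; the harmony-era branch that also forwarded thinking and tool-call deltas is removed. A self-contained sketch of that forwarding shape, with stand-in types and a hypothetical forward helper rather than the real server loop:

package main

import "fmt"

// Stand-in for the subset of CompletionResponse used by the restored loop.
type CompletionResponse struct {
	Content   string
	Done      bool
	EvalCount int
}

// forward mirrors the reverted control flow: content-only chunks are re-wrapped
// so intermediate callbacks never see stats, and the done event is sent as-is.
func forward(chunks []CompletionResponse, fn func(CompletionResponse)) {
	for _, c := range chunks {
		if c.Content != "" {
			fn(CompletionResponse{Content: c.Content})
		}
		if c.Done {
			fn(c)
			return
		}
	}
}

func main() {
	fn := func(r CompletionResponse) { fmt.Printf("%+v\n", r) }
	forward([]CompletionResponse{
		{Content: "Hello"},
		{Content: ", world"},
		{Done: true, EvalCount: 2},
	}, fn)
}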