mirror of
https://github.com/ollama/ollama.git
synced 2025-11-11 10:47:13 +01:00
Reapply "add truncate and shift parameters" (#12582)
This commit is contained in:
16
api/types.go
16
api/types.go
@@ -106,6 +106,14 @@ type GenerateRequest struct {
|
||||
// before this option was introduced)
|
||||
Think *ThinkValue `json:"think,omitempty"`
|
||||
|
||||
// Truncate is a boolean that, when set to true, truncates the chat history messages
|
||||
// if the rendered prompt exceeds the context length limit.
|
||||
Truncate *bool `json:"truncate,omitempty"`
|
||||
|
||||
// Shift is a boolean that, when set to true, shifts the chat history
|
||||
// when hitting the context length limit instead of erroring.
|
||||
Shift *bool `json:"shift,omitempty"`
|
||||
|
||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||
// template instead of calling the model.
|
||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||
@@ -140,6 +148,14 @@ type ChatRequest struct {
|
||||
// for supported models.
|
||||
Think *ThinkValue `json:"think,omitempty"`
|
||||
|
||||
// Truncate is a boolean that, when set to true, truncates the chat history messages
|
||||
// if the rendered prompt exceeds the context length limit.
|
||||
Truncate *bool `json:"truncate,omitempty"`
|
||||
|
||||
// Shift is a boolean that, when set to true, shifts the chat history
|
||||
// when hitting the context length limit instead of erroring.
|
||||
Shift *bool `json:"shift,omitempty"`
|
||||
|
||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||
// template instead of calling the model.
|
||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||
|
||||
@@ -1380,6 +1380,8 @@ type CompletionRequest struct {
|
||||
Options *api.Options
|
||||
|
||||
Grammar string // set before sending the request to the subprocess
|
||||
Shift bool
|
||||
Truncate bool
|
||||
}
|
||||
|
||||
// DoneReason represents the reason why a completion response is done
|
||||
@@ -1501,7 +1503,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
return fmt.Errorf("failed reading llm error response: %w", err)
|
||||
}
|
||||
log.Printf("llm predict error: %s", bodyBytes)
|
||||
return fmt.Errorf("%s", bodyBytes)
|
||||
return api.StatusError{StatusCode: res.StatusCode, ErrorMessage: strings.TrimSpace(string(bodyBytes))}
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
|
||||
@@ -79,6 +79,9 @@ type Sequence struct {
|
||||
// true if an embedding are to be returned instead of text generation
|
||||
embeddingOnly bool
|
||||
|
||||
// shift if context window is exceeded
|
||||
shift bool
|
||||
|
||||
doneReason llm.DoneReason
|
||||
|
||||
// Metrics
|
||||
@@ -94,8 +97,12 @@ type NewSequenceParams struct {
|
||||
numKeep int
|
||||
samplingParams *llama.SamplingParams
|
||||
embedding bool
|
||||
shift bool
|
||||
truncate bool
|
||||
}
|
||||
|
||||
var errorInputTooLong = errors.New("the input length exceeds the context length")
|
||||
|
||||
func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
|
||||
s.ready.Wait()
|
||||
|
||||
@@ -119,6 +126,10 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
|
||||
if len(inputs) > s.cache.numCtx {
|
||||
discard := len(inputs) - s.cache.numCtx
|
||||
if !params.truncate {
|
||||
return nil, errorInputTooLong
|
||||
}
|
||||
|
||||
newInputs := inputs[:params.numKeep]
|
||||
newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
|
||||
|
||||
@@ -385,6 +396,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||
for i, input := range seq.inputs {
|
||||
if len(seq.cache.Inputs)+len(seq.pendingInputs)+1 > s.cache.numCtx {
|
||||
if len(seq.pendingInputs) == 0 {
|
||||
if !seq.shift {
|
||||
s.removeSequence(seqIdx, llm.DoneReasonLength)
|
||||
break
|
||||
}
|
||||
|
||||
err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
|
||||
if err != nil {
|
||||
var reprocess *ErrReprocessInputs
|
||||
@@ -583,8 +599,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
numKeep: req.Options.NumKeep,
|
||||
samplingParams: &samplingParams,
|
||||
embedding: false,
|
||||
shift: req.Shift,
|
||||
truncate: req.Truncate,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -88,6 +88,9 @@ type Sequence struct {
|
||||
// true if an embedding are to be returned instead of text generation
|
||||
embeddingOnly bool
|
||||
|
||||
// shift if context window is exceeded
|
||||
shift bool
|
||||
|
||||
doneReason llm.DoneReason
|
||||
|
||||
// Metrics
|
||||
@@ -104,8 +107,12 @@ type NewSequenceParams struct {
|
||||
numKeep int32
|
||||
sampler sample.Sampler
|
||||
embedding bool
|
||||
shift bool
|
||||
truncate bool
|
||||
}
|
||||
|
||||
var errorInputTooLong = errors.New("the input length exceeds the context length")
|
||||
|
||||
func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
|
||||
s.ready.Wait()
|
||||
|
||||
@@ -125,6 +132,11 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
|
||||
if int32(len(inputs)) > s.cache.numCtx {
|
||||
discard := int32(len(inputs)) - s.cache.numCtx
|
||||
|
||||
if !params.truncate {
|
||||
return nil, errorInputTooLong
|
||||
}
|
||||
|
||||
promptStart := params.numKeep + discard
|
||||
|
||||
// If we need to truncate in the middle of a unbreakable batch, remove the entire batch
|
||||
@@ -176,6 +188,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
embeddingOnly: params.embedding,
|
||||
stop: params.stop,
|
||||
numKeep: params.numKeep,
|
||||
shift: params.shift,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -517,6 +530,12 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
|
||||
break
|
||||
}
|
||||
|
||||
if !seq.shift {
|
||||
s.removeSequence(seqIdx, llm.DoneReasonLength)
|
||||
nextBatch.seqs[seqIdx] = nil
|
||||
break
|
||||
}
|
||||
|
||||
err = s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
|
||||
if err != nil {
|
||||
var reprocess *ErrReprocessInputs
|
||||
@@ -832,8 +851,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
numKeep: int32(req.Options.NumKeep),
|
||||
sampler: sampler,
|
||||
embedding: false,
|
||||
shift: req.Shift,
|
||||
truncate: req.Truncate,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ type tokenizeFunc func(context.Context, string) ([]int, error)
|
||||
// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
|
||||
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
|
||||
// latest message and 2) system messages
|
||||
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *api.ThinkValue) (prompt string, images []llm.ImageData, _ error) {
|
||||
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *api.ThinkValue, truncate bool) (prompt string, images []llm.ImageData, _ error) {
|
||||
var system []api.Message
|
||||
|
||||
// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
|
||||
@@ -59,7 +59,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
}
|
||||
}
|
||||
|
||||
if ctxLen > opts.NumCtx {
|
||||
if truncate && ctxLen > opts.NumCtx {
|
||||
slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[i:]))
|
||||
break
|
||||
} else {
|
||||
|
||||
@@ -30,6 +30,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name string
|
||||
model Model
|
||||
limit int
|
||||
truncate bool
|
||||
msgs []api.Message
|
||||
expect
|
||||
}{
|
||||
@@ -37,6 +38,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "messages",
|
||||
model: visionModel,
|
||||
limit: 64,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -50,6 +52,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "truncate messages",
|
||||
model: visionModel,
|
||||
limit: 1,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -63,6 +66,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "truncate messages with image",
|
||||
model: visionModel,
|
||||
limit: 64,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -79,6 +83,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "truncate messages with images",
|
||||
model: visionModel,
|
||||
limit: 64,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -95,6 +100,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "messages with images",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -112,6 +118,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "message with image tag",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -129,6 +136,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "messages with interleaved images",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "user", Images: []api.ImageData{[]byte("something")}},
|
||||
@@ -148,6 +156,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "truncate message with interleaved images",
|
||||
model: visionModel,
|
||||
limit: 1024,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "user", Images: []api.ImageData{[]byte("something")}},
|
||||
@@ -166,6 +175,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "message with system prompt",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "system", Content: "You are the Test Who Lived."},
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
@@ -180,6 +190,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "out of order system",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
@@ -194,6 +205,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
name: "multiple images same prompt",
|
||||
model: visionModel,
|
||||
limit: 2048,
|
||||
truncate: true,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "Compare these two pictures of hotdogs", Images: []api.ImageData{[]byte("one hotdog"), []byte("two hotdogs")}},
|
||||
},
|
||||
@@ -202,6 +214,20 @@ func TestChatPrompt(t *testing.T) {
|
||||
images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no truncate with limit exceeded",
|
||||
model: visionModel,
|
||||
limit: 10,
|
||||
truncate: false,
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "You're a test, Harry!"},
|
||||
{Role: "assistant", Content: "I-I'm a what?"},
|
||||
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
|
||||
},
|
||||
expect: expect{
|
||||
prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
@@ -209,7 +235,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
model := tt.model
|
||||
opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
|
||||
think := false
|
||||
prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &api.ThinkValue{Value: think})
|
||||
prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &api.ThinkValue{Value: think}, tt.truncate)
|
||||
if tt.error == nil && err != nil {
|
||||
t.Fatal(err)
|
||||
} else if tt.error != nil && err != tt.error {
|
||||
|
||||
@@ -434,7 +434,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
// the real chat handler, but doing this as a stopgap to get renderer
|
||||
// support for generate
|
||||
if values.Messages != nil && values.Suffix == "" && req.Template == "" {
|
||||
prompt, images, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, values.Messages, []api.Tool{}, req.Think)
|
||||
prompt, images, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, values.Messages, []api.Tool{}, req.Think, req.Truncate == nil || *req.Truncate)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -492,6 +492,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
Images: images,
|
||||
Format: req.Format,
|
||||
Options: opts,
|
||||
Shift: req.Shift == nil || *req.Shift,
|
||||
Truncate: req.Truncate == nil || *req.Truncate,
|
||||
}, func(cr llm.CompletionResponse) {
|
||||
res := api.GenerateResponse{
|
||||
Model: req.Model,
|
||||
@@ -553,8 +555,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
|
||||
ch <- res
|
||||
}); err != nil {
|
||||
var serr api.StatusError
|
||||
if errors.As(err, &serr) {
|
||||
ch <- gin.H{"error": serr.ErrorMessage, "status": serr.StatusCode}
|
||||
} else {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
@@ -573,7 +580,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
msg = "unexpected error format in response"
|
||||
}
|
||||
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": msg})
|
||||
status, ok := t["status"].(int)
|
||||
if !ok {
|
||||
status = http.StatusInternalServerError
|
||||
}
|
||||
|
||||
c.JSON(status, gin.H{"error": msg})
|
||||
return
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
|
||||
@@ -1638,6 +1650,30 @@ func streamResponse(c *gin.Context, ch chan any) {
|
||||
return false
|
||||
}
|
||||
|
||||
// errors are provided as a gin.H with an "error" field and
|
||||
// an optional "status" field. For errors that are streamed
|
||||
// before any content, we need to set the status code and
|
||||
// content type for the error.
|
||||
if h, ok := val.(gin.H); ok {
|
||||
if e, ok := h["error"].(string); ok {
|
||||
status, ok := h["status"].(int)
|
||||
if !ok {
|
||||
status = http.StatusInternalServerError
|
||||
}
|
||||
|
||||
if !c.Writer.Written() {
|
||||
c.Header("Content-Type", "application/json")
|
||||
c.JSON(status, gin.H{"error": e})
|
||||
} else {
|
||||
if err := json.NewEncoder(c.Writer).Encode(gin.H{"error": e}); err != nil {
|
||||
slog.Error("streamResponse failed to encode json error", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
bts, err := json.Marshal(val)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("streamResponse: json.Marshal failed with %s", err))
|
||||
@@ -1957,7 +1993,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
|
||||
truncate := req.Truncate == nil || *req.Truncate
|
||||
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think, truncate)
|
||||
if err != nil {
|
||||
slog.Error("chat prompt error", "error", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
@@ -2038,6 +2075,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
Images: images,
|
||||
Format: currentFormat,
|
||||
Options: opts,
|
||||
Shift: req.Shift == nil || *req.Shift,
|
||||
Truncate: truncate,
|
||||
}, func(r llm.CompletionResponse) {
|
||||
res := api.ChatResponse{
|
||||
Model: req.Model,
|
||||
@@ -2130,8 +2169,13 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
if err != nil {
|
||||
if structuredOutputsState == structuredOutputsState_ReadyToApply && strings.Contains(err.Error(), "context canceled") && c.Request.Context().Err() == nil {
|
||||
// only ignores error if it's a context cancellation due to setting structured outputs
|
||||
} else {
|
||||
var serr api.StatusError
|
||||
if errors.As(err, &serr) {
|
||||
ch <- gin.H{"error": serr.ErrorMessage, "status": serr.StatusCode}
|
||||
} else {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -2145,7 +2189,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
msgs = append(msgs, msg)
|
||||
prompt, _, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
|
||||
prompt, _, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think, truncate)
|
||||
if err != nil {
|
||||
slog.Error("chat prompt error applying structured outputs", "error", err)
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
@@ -2185,7 +2229,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
msg = "unexpected error format in response"
|
||||
}
|
||||
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": msg})
|
||||
status, ok := t["status"].(int)
|
||||
if !ok {
|
||||
status = http.StatusInternalServerError
|
||||
}
|
||||
|
||||
c.JSON(status, gin.H{"error": msg})
|
||||
return
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
|
||||
|
||||
@@ -609,6 +609,58 @@ func TestGenerateChat(t *testing.T) {
|
||||
t.Errorf("final tool call mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("status error non-streaming", func(t *testing.T) {
|
||||
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
||||
return api.StatusError{
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
Status: "Service Unavailable",
|
||||
ErrorMessage: "model is overloaded",
|
||||
}
|
||||
}
|
||||
|
||||
stream := false
|
||||
w := createRequest(t, s.ChatHandler, api.ChatRequest{
|
||||
Model: "test",
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected status 503, got %d", w.Code)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(w.Body.String(), `{"error":"model is overloaded"}`); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("status error streaming", func(t *testing.T) {
|
||||
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
||||
return api.StatusError{
|
||||
StatusCode: http.StatusTooManyRequests,
|
||||
Status: "Too Many Requests",
|
||||
ErrorMessage: "rate limit exceeded",
|
||||
}
|
||||
}
|
||||
|
||||
w := createRequest(t, s.ChatHandler, api.ChatRequest{
|
||||
Model: "test",
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
})
|
||||
|
||||
if w.Code != http.StatusTooManyRequests {
|
||||
t.Errorf("expected status 429, got %d", w.Code)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(w.Body.String(), `{"error":"rate limit exceeded"}`); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestGenerate(t *testing.T) {
|
||||
@@ -983,6 +1035,55 @@ func TestGenerate(t *testing.T) {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("status error non-streaming", func(t *testing.T) {
|
||||
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
||||
return api.StatusError{
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
Status: "Service Unavailable",
|
||||
ErrorMessage: "model is overloaded",
|
||||
}
|
||||
}
|
||||
|
||||
streamRequest := false
|
||||
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
|
||||
Model: "test",
|
||||
Prompt: "Hello!",
|
||||
Stream: &streamRequest,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected status 503, got %d", w.Code)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(w.Body.String(), `{"error":"model is overloaded"}`); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("status error streaming", func(t *testing.T) {
|
||||
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
||||
return api.StatusError{
|
||||
StatusCode: http.StatusTooManyRequests,
|
||||
Status: "Too Many Requests",
|
||||
ErrorMessage: "rate limit exceeded",
|
||||
}
|
||||
}
|
||||
|
||||
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
|
||||
Model: "test",
|
||||
Prompt: "Hello!",
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusTooManyRequests {
|
||||
t.Errorf("expected status 429, got %d", w.Code)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(w.Body.String(), `{"error":"rate limit exceeded"}`); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user