diff --git a/llm/patches/05-clip-fix.diff b/llm/patches/05-clip-fix.diff
new file mode 100644
index 000000000..3f68a5bb6
--- /dev/null
+++ b/llm/patches/05-clip-fix.diff
@@ -0,0 +1,24 @@
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index e3c9bcd4..b43f892d 100644
+--- a/examples/llava/clip.cpp
++++ b/examples/llava/clip.cpp
+@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
+     struct ggml_tensor * embeddings = inp;
+     if (ctx->has_class_embedding) {
+         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
++    }
++    ggml_set_name(embeddings, "embeddings");
++    ggml_set_input(embeddings);
++
++    if (ctx->has_class_embedding) {
+         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+         embeddings = ggml_acc(ctx0, embeddings, inp,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+     }
+-    ggml_set_name(embeddings, "embeddings");
+-    ggml_set_input(embeddings);
+-
+ 
+     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+     ggml_set_name(positions, "positions");
diff --git a/llm/server.go b/llm/server.go
index 2272ac839..44bada08b 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
 	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models (those loaded with projectors) don't support parallel
+	// requests yet, so --parallel is forced to 1; see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
+	}
+
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {