From 8b51db204f1c8ba7d1e9b5e2de305f13182da5b0 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 7 Feb 2025 16:40:09 -0800
Subject: [PATCH] tmp

---
 llama/patches/0016-add-ollama-debug.patch    |  33 ++++++
 ml/backend/ggml/ggml.go                      |  20 ++--
 ml/backend/ggml/ggml/.rsync-filter           |   1 +
 ml/backend/ggml/ggml/include/ollama-debug.h  |  11 ++
 ml/backend/ggml/ggml/src/debug.go            |   6 +
 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c |   4 +
 ml/backend/ggml/ggml/src/ollama-debug.c      | 110 +++++++++++++++++++
 7 files changed, 176 insertions(+), 9 deletions(-)
 create mode 100644 llama/patches/0016-add-ollama-debug.patch
 create mode 100644 ml/backend/ggml/ggml/include/ollama-debug.h
 create mode 100644 ml/backend/ggml/ggml/src/debug.go
 create mode 100644 ml/backend/ggml/ggml/src/ollama-debug.c

diff --git a/llama/patches/0016-add-ollama-debug.patch b/llama/patches/0016-add-ollama-debug.patch
new file mode 100644
index 000000000..1dc3a8754
--- /dev/null
+++ b/llama/patches/0016-add-ollama-debug.patch
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Fri, 7 Feb 2025 16:51:55 -0800
+Subject: [PATCH] add ollama debug
+
+---
+ ggml/src/ggml-cpu/ggml-cpu.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index b307d554..a23bbe98 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
++++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -13,6 +13,8 @@
+ #include "amx/amx.h"
+ #include "ggml.h"
+ 
++#include "ollama-debug.h"
++
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+@@ -13645,6 +13647,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+ 
+         ggml_compute_forward(&params, node);
+ 
++#ifdef OLLAMA_DEBUG
++        ollama_debug(node, false);
++#endif
++
+         if (state->ith == 0 && cplan->abort_callback &&
+             cplan->abort_callback(cplan->abort_callback_data)) {
+             tp->abort = true;
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index a3a4bec49..4c5a8df5f 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -99,15 +99,17 @@ func New(r *os.File) (ml.Backend, error) {
 				}),
 				backend: C.ggml_backend_dev_init(d.d, nil),
 			})
-		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-			slog.Info("gpu", "device", d)
-			gpus = append(gpus, Context{
-				ctx: C.ggml_init(C.struct_ggml_init_params{
-					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
-					no_alloc: true,
-				}),
-				backend: C.ggml_backend_dev_init(d.d, nil),
-			})
+
+			C.ggml_backend_cpu_set_n_threads(cpus[len(cpus)-1].backend, C.int(1))
+			// case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+			// 	slog.Info("gpu", "device", d)
+			// 	gpus = append(gpus, Context{
+			// 		ctx: C.ggml_init(C.struct_ggml_init_params{
+			// 			mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
+			// 			no_alloc: true,
+			// 		}),
+			// 		backend: C.ggml_backend_dev_init(d.d, nil),
+			// 	})
 		}
 	}
 
diff --git a/ml/backend/ggml/ggml/.rsync-filter b/ml/backend/ggml/ggml/.rsync-filter
index c5acbe490..5ea9c2014 100644
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@@ -1,5 +1,6 @@
 protect *.go
 protect *-embed.*
+protect ollama-debug.*
 include include/
 include src/
 include src/CMakeLists.txt
diff --git a/ml/backend/ggml/ggml/include/ollama-debug.h b/ml/backend/ggml/ggml/include/ollama-debug.h
new file mode 100644
index 000000000..36a2e241a
--- /dev/null
+++ b/ml/backend/ggml/ggml/include/ollama-debug.h
@@ -0,0 +1,11 @@
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ml/backend/ggml/ggml/src/debug.go b/ml/backend/ggml/ggml/src/debug.go
new file mode 100644
index 000000000..9ddb2718c
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package ggml
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
index b307d5542..4efc10c5b 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13,6 +13,8 @@
 #include "amx/amx.h"
 #include "ggml.h"
 
+#include "ollama-debug.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -13645,6 +13647,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 
+        ollama_debug(node, true);
+
         if (state->ith == 0 && cplan->abort_callback &&
             cplan->abort_callback(cplan->abort_callback_data)) {
             tp->abort = true;
diff --git a/ml/backend/ggml/ggml/src/ollama-debug.c b/ml/backend/ggml/ggml/src/ollama-debug.c
new file mode 100644
index 000000000..65839ae4e
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ollama-debug.c
@@ -0,0 +1,110 @@
+#include <stdio.h>
+
+#include "ollama-debug.h"
+
+static int mul(int64_t *dims, int ndims) {
+  int result = 1;
+  for (int i = 0; i < ndims; i++) {
+    result *= dims[i];
+  }
+
+  return result;
+}
+
+static void repeat(char c, int n) {
+  for (int i = 0; i < n; i++) {
+    fprintf(stderr, "%c", c);
+  }
+}
+
+static void print_tensor(const void *tensor, void (*cb)(const void *, int),
+                         int shape,
+                         int64_t *dims, int ndims, int stride,
+                         int nitems, int pad) {
+  fprintf(stderr, "[");
+  for (int i = 0; i < dims[0]; i++) {
+    if (i >= nitems && i < dims[0] - nitems) {
+      fprintf(stderr, "... (%lld more), ", dims[0] - 2 * nitems);
+      int skip = dims[0] - 2 * nitems;
+      if (ndims > 1) {
+        stride += mul(dims + 1, ndims - 1) * skip;
+        repeat('\n', ndims - 1);
+        repeat(' ', shape - ndims + 1 + pad);
+      }
+      i += skip - 1;
+    } else if (ndims > 1) {
+      print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
+                   nitems, pad);
+      stride += mul(dims + 1, ndims - 1);
+      if (i < dims[0] - 1) {
+        fprintf(stderr, ", ");
+        repeat('\n', ndims - 1);
+        repeat(' ', shape - ndims + 1 + pad);
+      }
+    } else {
+      cb(tensor, stride + i);
+      if (i < dims[0] - 1) {
+        fprintf(stderr, ", ");
+      }
+    }
+  }
+  fprintf(stderr, "]");
+}
+
+static void print_tensor_f16(const void *tensor, int i) {
+  fprintf(stderr, "%f", ggml_fp16_to_fp32(((const ggml_fp16_t *)tensor)[i]));
+}
+
+static void print_tensor_f32(const void *tensor, int i) {
+  fprintf(stderr, "%f", ((const float *)tensor)[i]);
+}
+
+static void print_tensor_i32(const void *tensor, int i) {
+  fprintf(stderr, "%d", ((const int32_t *)tensor)[i]);
+}
+
+static void ollama_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
+  fprintf(stderr, "%s%s %s (%s): [%lld %lld %lld %lld]\n", prefix, tensor->name,
+          ggml_op_name(tensor->op), ggml_type_name(tensor->type), tensor->ne[0],
+          tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+
+  if (!verbose) {
+    return;
+  }
+
+  for (int i = 0; i < indent; i++) {
+    fprintf(stderr, " ");
+  }
+
+  switch (tensor->type) {
+  case GGML_TYPE_F16:
+    print_tensor(ggml_get_data(tensor), print_tensor_f16, ggml_n_dims(tensor),
+                 (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+    break;
+  case GGML_TYPE_F32:
+    print_tensor(ggml_get_data(tensor), print_tensor_f32, ggml_n_dims(tensor),
+                 (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+    break;
+  case GGML_TYPE_I32:
+    print_tensor(ggml_get_data(tensor), print_tensor_i32, ggml_n_dims(tensor),
+                 (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+    break;
+  default:
+    fprintf(stderr, "\n");
+    return;
+  }
+
+  fprintf(stderr, "\n");
+}
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose) {
+  ollama_debug_tensor(tensor, verbose, ">>> ", 4);
+
+  if (tensor->src[0] != NULL) {
+    ollama_debug_tensor(tensor->src[0], verbose, " ?? ", 4);
+  }
+
+  if (tensor->src[1] != NULL) {
+    ollama_debug_tensor(tensor->src[1], verbose, " ?? ", 4);
+  }
+}
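
Note (not part of the commit above): with the debug.go stub in place, building the Go side with the "debug" build tag (e.g. go build -tags debug ./...) adds -DOLLAMA_DEBUG to the cgo CPPFLAGS, and the hook in ggml_graph_compute_thread() then dumps every node it executes via ollama_debug(). The C sketch below is a hypothetical standalone harness for the same function, assuming ollama-debug.c is compiled and linked together with ggml; the file name, tensor shape, and fill values are illustrative only and do not appear in the patch.

    // check-ollama-debug.c (hypothetical): compile with ollama-debug.c and ggml,
    // then run to see the ">>> name OP (type): [ne0 ne1 ne2 ne3]" header and,
    // with verbose=true, the bracketed dump of the tensor contents.
    #include "ggml.h"
    #include "ollama-debug.h"

    int main(void) {
        struct ggml_init_params params = {
            /* .mem_size   = */ 16 * 1024 * 1024, // small scratch arena for the test tensor
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,            // allocate tensor data inside the arena
        };
        struct ggml_context *ctx = ggml_init(params);

        // 4x3 f32 tensor; fill it so the verbose printout has something to show
        struct ggml_tensor *t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        ggml_set_name(t, "example");
        float *data = (float *) ggml_get_data(t);
        for (int64_t i = 0; i < ggml_nelements(t); i++) {
            data[i] = (float) i;
        }

        ollama_debug(t, /* verbose */ true); // same call the patched compute loop makes per node

        ggml_free(ctx);
        return 0;
    }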