From 9e4642e9b3e9a26d423c62915805375ca253d7d1 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Sun, 9 Mar 2025 14:45:36 -0700
Subject: [PATCH] ollama debug tensor

---
 llama/patches/0020-ollama-debug-tensor.patch |  33 +++++
 ml/backend/ggml/ggml.go                      |   2 +-
 ml/backend/ggml/ggml/include/ollama-debug.h  |  11 ++
 .../ggml/ggml/src/ggml-cpu/cpu_debug.go      |   6 +
 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c |   6 +
 ml/backend/ggml/ggml/src/ollama-debug.c      | 115 ++++++++++++++++++
 ml/backend/ggml/threads.go                   |   7 ++
 ml/backend/ggml/threads_debug.go             |   7 ++
 8 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 llama/patches/0020-ollama-debug-tensor.patch
 create mode 100644 ml/backend/ggml/ggml/include/ollama-debug.h
 create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
 create mode 100644 ml/backend/ggml/ggml/src/ollama-debug.c
 create mode 100644 ml/backend/ggml/threads.go
 create mode 100644 ml/backend/ggml/threads_debug.go

diff --git a/llama/patches/0020-ollama-debug-tensor.patch b/llama/patches/0020-ollama-debug-tensor.patch
new file mode 100644
index 000000000..b9f2e4ab0
--- /dev/null
+++ b/llama/patches/0020-ollama-debug-tensor.patch
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Sun, 9 Mar 2025 14:44:16 -0700
+Subject: [PATCH] ollama debug tensor
+
+---
+ ggml/src/ggml-cpu/ggml-cpu.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index 2f606d82..ec60e8fc 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
++++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -11,6 +11,8 @@
+ #include "ggml-threading.h"
+ #include "ggml.h"
+
++#include "ollama-debug.h"
++
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+
+         ggml_compute_forward(&params, node);
+
++#ifdef OLLAMA_DEBUG
++        ollama_debug(node, true);
++#endif
++
+         if (state->ith == 0 && cplan->abort_callback &&
+             cplan->abort_callback(cplan->abort_callback_data)) {
+             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index b4efe9da4..6e123e1ff 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -355,7 +355,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		if C.ggml_backend_is_cpu(b) {
 			// set number of threads for cpu backend
-			C.ggml_backend_cpu_set_n_threads(b, C.int(params.NumThreads))
+			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
 		}
 	}
diff --git a/ml/backend/ggml/ggml/include/ollama-debug.h b/ml/backend/ggml/ggml/include/ollama-debug.h
new file mode 100644
index 000000000..36a2e241a
--- /dev/null
+++ b/ml/backend/ggml/ggml/include/ollama-debug.h
@@ -0,0 +1,11 @@
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
new file mode 100644
index 000000000..7eab98138
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package cpu
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
index 2f606d824..ec60e8fcf 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -11,6 +11,8 @@
 #include "ggml-threading.h"
 #include "ggml.h"

+#include "ollama-debug.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

         ggml_compute_forward(&params, node);

+#ifdef OLLAMA_DEBUG
+        ollama_debug(node, true);
+#endif
+
         if (state->ith == 0 && cplan->abort_callback &&
             cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
diff --git a/ml/backend/ggml/ggml/src/ollama-debug.c b/ml/backend/ggml/ggml/src/ollama-debug.c
new file mode 100644
index 000000000..b0e9d7f08
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ollama-debug.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+
+#include "ollama-debug.h"
+
+static int mul(int64_t *dims, int ndims) {
+    int result = 1;
+    for (int i = 0; i < ndims; i++) {
+        result *= dims[i];
+    }
+
+    return result;
+}
+
+static void repeat(char c, int n) {
+    for (int i = 0; i < n; i++) {
+        fprintf(stderr, "%c", c);
+    }
+}
+
+static void print_tensor(const void *tensor, void (*cb)(const void *, int),
+                         int shape,
+                         int64_t *dims, int ndims, int stride,
+                         int nitems, int pad) {
+    fprintf(stderr, "[");
+    for (int i = 0; i < dims[0]; i++) {
+        if (i >= nitems && i < dims[0] - nitems) {
+            fprintf(stderr, "... (%lld more), ", dims[0] - 2 * nitems);
+            int skip = dims[0] - 2 * nitems;
+            if (ndims > 1) {
+                stride += mul(dims + 1, ndims - 1) * skip;
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+            i += skip - 1;
+        } else if (ndims > 1) {
+            print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
+                         nitems, pad);
+            stride += mul(dims + 1, ndims - 1);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+        } else {
+            cb(tensor, stride + i);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+            }
+        }
+    }
+    fprintf(stderr, "]");
+}
+
+static void print_tensor_f16(const void *tensor, int i) {
+    float value = ggml_fp16_to_fp32(((const ggml_fp16_t *)tensor)[i]);
+    fprintf(stderr, "%s%f", value < 0 ? "" : " ", value);
+}
+
+static void print_tensor_f32(const void *tensor, int i) {
+    float value = ((const float *)tensor)[i];
+    fprintf(stderr, "%s%f", value < 0 ? "" : " ", value);
+}
+
+static void print_tensor_i32(const void *tensor, int i) {
+    int32_t value = ((const int32_t *)tensor)[i];
+    fprintf(stderr, "%s%d", value < 0 ? "" : " ", value);
+}
+
+static void ollama_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
+    fprintf(stderr, "%s%s %s (%s): [%lld %lld %lld %lld]\n", prefix, tensor->name,
+            ggml_op_name(tensor->op), ggml_type_name(tensor->type), tensor->ne[0],
+            tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+
+    if (!verbose) {
+        return;
+    }
+
+    for (int i = 0; i < indent; i++) {
+        fprintf(stderr, " ");
+    }
+
+    switch (tensor->type) {
+    case GGML_TYPE_F16:
+        print_tensor(ggml_get_data(tensor), print_tensor_f16, ggml_n_dims(tensor),
+                     (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+        break;
+    case GGML_TYPE_F32:
+        print_tensor(ggml_get_data(tensor), print_tensor_f32, ggml_n_dims(tensor),
+                     (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+        break;
+    case GGML_TYPE_I32:
+        print_tensor(ggml_get_data(tensor), print_tensor_i32, ggml_n_dims(tensor),
+                     (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+        break;
+    default:
+        fprintf(stderr, "\n");
+        return;
+    }
+
+    fprintf(stderr, "\n");
+}
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose) {
+    ollama_debug_tensor(tensor, verbose, ">>> ", 4);
+
+    for (int i = 0; i < GGML_MAX_SRC && tensor->src[i] != NULL; ++i) {
+        char src[8];
+        const int n = snprintf(src, sizeof(src), " src%d ", i);
+        if (n >= sizeof(src)) {
+            src[sizeof(src) - 1] = '\0';
+        }
+
+        ollama_debug_tensor(tensor->src[i], verbose, src, 4);
+    }
+}
diff --git a/ml/backend/ggml/threads.go b/ml/backend/ggml/threads.go
new file mode 100644
index 000000000..cbc524f7a
--- /dev/null
+++ b/ml/backend/ggml/threads.go
@@ -0,0 +1,7 @@
+//go:build !debug
+
+package ggml
+
+func Threads(n int) int {
+	return n
+}
diff --git a/ml/backend/ggml/threads_debug.go b/ml/backend/ggml/threads_debug.go
new file mode 100644
index 000000000..cfd334bd2
--- /dev/null
+++ b/ml/backend/ggml/threads_debug.go
@@ -0,0 +1,7 @@
+//go:build debug
+
+package ggml
+
+func Threads(_ int) int {
+	return 1
+}