From 60f0b7db76761dcf0f803327367c08848a9e3973 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Fri, 24 Jan 2025 16:51:19 -0800
Subject: [PATCH] working

---
 .gitignore                           |   4 +-
 ml/backend.go                        |  13 ++
 ml/backend/ggml/ggml.go              |  55 ++++-
 model/README.md                      | 169 +++++++++++++++
 model/model_test/model_test.go       |  91 +++++++++
 model/model_test/testdata/qwen2.json | 294 +++++++++++++++++++++++++++
 model/qwen2/model.go                 | 201 ++++++++++++++++++
 runner/newrunner/runner.go           |   1 +
 8 files changed, 817 insertions(+), 11 deletions(-)
 create mode 100644 model/README.md
 create mode 100644 model/model_test/model_test.go
 create mode 100644 model/model_test/testdata/qwen2.json
 create mode 100644 model/qwen2/model.go

diff --git a/.gitignore b/.gitignore
index caa62a524..10cdf2416 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,6 @@ test_data
 *.crt
 llama/build
 __debug_bin*
-llama/vendor
\ No newline at end of file
+llama/vendor
+model/model_test/testdata/*/
+!model/model_test/testdata/*.*

diff --git a/ml/backend.go b/ml/backend.go
index b21d2441f..a70adcc2e 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -24,6 +24,15 @@ type Backend interface {
 	NewContext() Context
 }
 
+// GraphLayer is a single traced tensor: the name it was recorded under and
+// its shape.
+type GraphLayer struct {
+	Name  string  `json:"name"`
+	Shape []int64 `json:"shape"`
+}
+
+// Graph is the ordered list of tensors traced during a forward pass.
+type Graph struct {
+	Graph []GraphLayer `json:"graph"`
+}
+
var backends = make(map[string]func(*os.File) (Backend, error))
 
 func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
@@ -50,6 +59,10 @@ type Context interface {
 	Forward(Tensor)
 	Compute(Tensor) Tensor
 	Close() error
+
+	// SetDebug toggles tracing; when enabled, Trace records a named tensor
+	// shape and GetTrace returns everything recorded so far.
+	SetDebug(bool)
+	Trace(string, Tensor)
+	GetTrace() Graph
 }
 
 type Tensor interface {

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 39b769edc..91f53b903 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -222,6 +222,7 @@ func (b *Backend) NewContext() ml.Context {
 			C.size_t(nodes),
 			true,
 		),
+		traceGraph: ml.Graph{},
 	}
 }
 
@@ -232,6 +233,9 @@ type Context struct {
 	sched *C.struct_ggml_backend_sched
 	graph *C.struct_ggml_cgraph
 	nodes int
+
+	debug      bool
+	traceGraph ml.Graph
 }
 
 func (c *Context) Forward(t ml.Tensor) {
@@ -320,6 +324,34 @@ func (c *Context) Close() error {
 	return nil
 }
 
+func (c *Context) SetDebug(debug bool) {
+	c.debug = debug
+}
+
+// Trace records the name and shape of a tensor when debugging is enabled;
+// shapes are padded to four dimensions to match ggml's tensor layout.
+func (c *Context) Trace(name string, t ml.Tensor) {
+	if !c.debug {
+		return
+	}
+
+	shape := t.Shape()
+	shapeArr := make([]int64, 4)
+	for i := 0; i < len(shape); i++ {
+		shapeArr[i] = shape[i]
+	}
+
+	c.traceGraph.Graph = append(
+		c.traceGraph.Graph,
+		ml.GraphLayer{
+			Name:  name,
+			Shape: shapeArr,
+		},
+	)
+}
+
+func (c *Context) GetTrace() ml.Graph {
+	return c.traceGraph
+}
+
 type Tensor struct {
 	t    *C.struct_ggml_tensor
 	data []byte
@@ -555,16 +587,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
-			C.int(ropeDim),
-			131072,       // YaRN n_ctx_train
-			ropeTypeNorm, // ROPE_TYPE_NORM
-			C.float(ropeBase),
-			C.float(ropeScale),
-			0.,  // YaRN ext_factor
-			1.,  // YaRN attn_factor
-			32., // YaRN beta_fast
-			1.,  // YaRN beta_slow
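+			// NOTE: the values below are hardcoded to match a reference qwen2
+			// debug log; the ropeDim, ropeBase, and ropeScale arguments passed
+			// to RoPE are ignored for now.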
+			ctx.(*Context).ctx,
+			t.t,                     // a tensor
+			positionIDs.(*Tensor).t, // b tensor with dims [512, 1, 1, 1]
+			nil,                     // c tensor (not shown in log)
+			C.int(64),               // n_dims: 64
+			2,                       // mode: 2 (ropeTypeNeox = 2)
+			C.int(32768),            // n_ctx_orig: 32768
+			C.float(1000000.0),      // freq_base: 1000000.000000
+			C.float(1.0),            // freq_scale: 1.000000
+			C.float(0.0),            // ext_factor: 0.000000
+			C.float(1.0),            // attn_factor: 1.000000
+			C.float(32.0),           // beta_fast: 32.000000
+			C.float(1.0),            // beta_slow: 1.000000
 		),
 	}
 }

diff --git a/model/README.md b/model/README.md
new file mode 100644
index 000000000..c1979b85b
--- /dev/null
+++ b/model/README.md
@@ -0,0 +1,169 @@
+# Ollama Models
+
+!! This is a work in progress document !!
+
+## Architecture
+
+```mermaid
+graph TB
+    subgraph Models["Model Layer: LLM Implementations"]
+        direction TB
+        llama["llama/model.go"]
+        mllama["mllama/model.go"]
+        qwen["qwen2/model.go"]
+        qwen_vl["qwen2vl/model.go"]
+        pixtral["pixtral/"]
+
+        note1["Each model implements a specific architecture
+        - Defines model parameters
+        - Handles tokenization
+        - Implements forward pass
+        - Manages model weights"]
+    end
+
+    subgraph ML_Ops["Neural Network Operations"]
+        direction TB
+        nn_ops["nn/
+        linear.go - Matrix operations
+        embedding.go - Token embeddings
+        normalization.go - Layer normalization
+        convolution.go - Conv operations"]
+
+        backend["ml/backend.go
+        Hardware Abstraction Layer
+        - Defines tensor operations
+        - Manages computation graphs
+        - Handles memory allocation"]
+
+        note2["Common neural net operations
+        used across different models
+        - Abstracts hardware details
+        - Provides unified API
+        - Manages computation flow"]
+    end
+
+    subgraph GGML["Hardware Execution Layer"]
+        direction TB
+        ggml["ggml.go
+        CGO Interface
+        - Bridges Go and C++
+        - Handles type conversion
+        - Manages memory between languages"]
+
+        subgraph Hardware_Specific["Hardware-Specific Implementations"]
+            direction LR
+            cpu["ggml-cpu.h
+            CPU optimized ops"]
+            cuda["ggml-cuda.h
+            NVIDIA GPU ops"]
+            metal["ggml-metal.h
+            Apple GPU ops"]
+            vulkan["ggml-vulkan.h
+            Cross-platform GPU"]
+            opencl["ggml-opencl.h
+            OpenCL acceleration"]
+        end
+
+        note3["GGML provides optimized
+        implementations for each hardware:
+        - Automatic dispatch
+        - Hardware-specific optimizations
+        - Memory management
+        - Parallel execution"]
+    end
+
+    %% Connections with explanations
+    Models --> |"Makes high-level calls
+    (e.g., self-attention)"| ML_Ops
+    ML_Ops --> |"Translates to tensor operations
+    (e.g., matmul, softmax)"| GGML
+    GGML --> |"Executes optimized code
+    on target hardware"| Hardware_Specific
+
+    %% Styling
+    classDef model fill:#fff,stroke:#01579b,stroke-width:2px
+    classDef ml fill:#fff,stroke:#e65100,stroke-width:2px
+    classDef hw fill:#fff,stroke:#b71c1c,stroke-width:2px
+    classDef note fill:#fff,stroke:#666,stroke-dasharray: 5 5
+
+    class llama,mllama,qwen,qwen_vl,pixtral model
+    class nn_ops,backend ml
+    class ggml,cpu,cuda,metal,vulkan,opencl hw
+    class note1,note2,note3 note
+
+    %% Style subgraphs
+    style Models fill:#fff,stroke:#01579b,stroke-width:2px
+    style ML_Ops fill:#fff,stroke:#e65100,stroke-width:2px
+    style GGML fill:#fff,stroke:#b71c1c,stroke-width:2px
+    style Hardware_Specific fill:#fff,stroke:#b71c1c,stroke-width:1px
+```
+
+## Adding support for a new model to Ollama
+
+1. Clone the Ollama repo and get it running locally: https://github.com/ollama/ollama/blob/main/docs/development.md
+2. Get the original model (research code) running locally. This will 99.99% of the time be a Python repository.
+3. Get a dump of the model graph (tensor names and shapes) from the PyTorch or safetensors checkpoint. Use this snippet to do so:
+```python
+import torch
+import sys
+from safetensors.torch import load_file
+
+def extract_graph(model_path):
+    # Load the checkpoint's state dict (safetensors or a PyTorch checkpoint).
+    if model_path.endswith('.safetensors'):
+        state_dict = load_file(model_path)
+    else:
+        state_dict = torch.load(model_path, weights_only=True)
+
+    graph = []
+    for name, tensor in state_dict.items():
+        if isinstance(tensor, torch.Tensor):
+            graph.append({
+                "name": name,
+                "shape": list(tensor.shape)
+            })
+
+    # Print as JSON, one layer per line, matching the testdata format.
+    print("{")
+    print('  "graph": [')
+    for i, layer in enumerate(graph):
+        comma = "," if i < len(graph) - 1 else ""
+        print(f'    {{"name": "{layer["name"]}", "shape": {layer["shape"]}}}{comma}')
+    print("  ]")
+    print("}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python extract.py <model_path>")
+        sys.exit(1)
+
+    extract_graph(sys.argv[1])
+```
+Run it with the checkpoint path and redirect the output, e.g. `python extract.py /path/to/model.safetensors > qwen2.json` (illustrative paths).
+4. Look at a previous model implementation pull request and copy the structure of the files needed. We will need:
+   1. A `model/<architecture>` directory
+   2. A `model/<architecture>/model.go` file to implement the architecture and forward pass.
+   3. A `model/<architecture>/convert.go` file to implement the conversion from PyTorch/safetensors to GGML.
+   4. `model/<architecture>/model_test.go` and `model/<architecture>/convert_test.go` files for testing.
+   5. Modify the main paths to make this new model accessible (e.g., the blank imports in `runner/newrunner/runner.go`).
+5. Open a draft pull request in the `ollama/ollama` repo, as a place to ask questions and get answers from Ollama maintainers.
+6. Implement the conversion from the model weights (PyTorch, safetensors) to GGML in the `model/<architecture>/convert.go` file. Reference other `convert.go` files.
+7. Create a Modelfile that only references the PyTorch/safetensors directory. We will handle the other fields later.
+Modelfile:
+```
+FROM /path/to/model
+```
+Use `ollama create` to convert the model:
+`go run . create <my-model> -f /path/to/Modelfile`
+8. Implement the `New()` and `Forward()` logic in `model/<architecture>/model.go`. Reference other `model.go` files.
+Run the model and get the debug output of the forward pass to compare with the output of the research implementation from step 2:
+`OLLAMA_DEBUG=1 go run . run <my-model>`
+9. (maybe) Implement a new tokenizer, if needed.
+10. Test text generation; this step requires knowing the prompt format:
+`go run . run <my-model> "hello"`
+11. Add tests to `model/<architecture>/model_test.go` and `model/<architecture>/convert_test.go`.
+12. Push changes to the `ollama/ollama` pull request, and move the pull request out of the draft state.
+13. Push the model to ollama.com:
+    1. Find the model's prompt format and convert it to a Go template (see the example below).
+    2. Create a Modelfile `FROM` the converted GGUF, and add the `TEMPLATE`, `LICENSE`, and parameters if needed.
+    3. `ollama create <username>/<my-model> -f /path/to/Modelfile`
+    4. `ollama push <username>/<my-model>`
+14. Run end-to-end integration tests.
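+
+As a sketch for step 13, a model that uses the ChatML prompt format (as the
+Qwen2 chat variants do) would get a Go template along these lines; verify
+against the model's actual prompt format before using it:
+
+```
+{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ .Response }}<|im_end|>
+```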
+ } + + for _, tt := range cases { + t.Run(tt, func(t *testing.T) { + t.Parallel() + + p := filepath.Join("testdata", tt) + if testing.Short() { + t.Skip("skipping in short mode") + } else if _, err := os.Stat(p); err != nil { + t.Skipf("%s not found", p) + } + + f, err := os.CreateTemp(t.TempDir(), "f16") + if err != nil { + t.Fatal(err) + } + defer func() { + f.Close() + os.Remove(f.Name()) + }() + + if err := convert.ConvertModel(os.DirFS(p), f); err != nil { + t.Fatal(err) + } + + m, err := model.New(f.Name()) + if err != nil { + t.Fatal(err) + } + b := m.Backend() + ctx := b.NewContext() + ctx.SetDebug(true) + + // Run forward pass + _, err = model.Forward(ctx, m, model.WithCache(cache.NewCausalCache(m.Backend(), 2048, ml.DTypeF32))) + if err != nil { + t.Fatal(err) + } + + // Validate the graph layers + data, err := os.ReadFile(filepath.Join("testdata", tt+".json")) + if err != nil { + t.Fatal(err) + } + var expected ml.Graph + if err := json.Unmarshal(data, &expected); err != nil { + t.Fatal(err) + } + + result := ctx.GetTrace() + + if len(result.Graph) != len(expected.Graph) { + t.Errorf("expected %d layers, got %d", len(expected.Graph), len(result.Graph)) + } + + for i, layer := range expected.Graph { + if i >= len(result.Graph) { + break + } + actual := result.Graph[i] + if layer.Name != actual.Name { + t.Errorf("layer %d: expected name %s, got %s", i, layer.Name, actual.Name) + } + if !reflect.DeepEqual(layer.Shape, actual.Shape) { + t.Errorf("layer %d: expected shape %v, got %v", i, layer.Shape, actual.Shape) + } + } + }) + } +} diff --git a/model/model_test/testdata/qwen2.json b/model/model_test/testdata/qwen2.json new file mode 100644 index 000000000..a2868ce10 --- /dev/null +++ b/model/model_test/testdata/qwen2.json @@ -0,0 +1,294 @@ +{ + "graph": [ + {"name": "model.embed_tokens.weight", "shape": [151936, 896]}, + {"name": "model.layers.0.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.0.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.0.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.0.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.0.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.0.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.0.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.0.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.0.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.0.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.0.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.0.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.1.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.1.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.1.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.1.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.1.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.1.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.1.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.1.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.1.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.1.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.1.self_attn.v_proj.bias", "shape": [128]}, + {"name": 
"model.layers.1.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.10.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.10.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.10.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.10.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.10.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.10.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.10.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.10.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.10.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.10.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.10.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.10.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.11.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.11.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.11.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.11.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.11.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.11.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.11.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.11.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.11.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.11.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.11.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.11.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.12.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.12.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.12.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.12.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.12.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.12.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.12.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.12.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.12.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.12.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.12.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.12.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.13.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.13.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.13.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.13.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.13.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.13.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.13.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.13.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.13.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.13.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.13.self_attn.v_proj.bias", "shape": [128]}, + {"name": 
"model.layers.13.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.14.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.14.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.14.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.14.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.14.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.14.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.14.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.14.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.14.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.14.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.14.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.14.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.15.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.15.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.15.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.15.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.15.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.15.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.15.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.15.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.15.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.15.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.15.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.15.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.16.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.16.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.16.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.16.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.16.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.16.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.16.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.16.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.16.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.16.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.16.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.16.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.17.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.17.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.17.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.17.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.17.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.17.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.17.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.17.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.17.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.17.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.17.self_attn.v_proj.bias", "shape": [128]}, + {"name": 
"model.layers.17.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.18.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.18.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.18.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.18.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.18.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.18.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.18.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.18.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.18.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.18.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.18.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.18.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.19.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.19.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.19.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.19.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.19.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.19.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.19.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.19.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.19.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.19.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.19.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.19.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.2.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.2.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.2.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.2.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.2.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.2.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.2.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.2.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.2.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.2.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.2.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.2.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.20.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.20.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.20.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.20.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.20.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.20.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.20.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.20.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.20.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.20.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.20.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.20.self_attn.v_proj.weight", 
"shape": [128, 896]}, + {"name": "model.layers.21.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.21.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.21.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.21.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.21.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.21.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.21.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.21.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.21.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.21.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.21.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.21.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.22.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.22.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.22.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.22.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.22.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.22.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.22.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.22.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.22.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.22.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.22.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.22.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.23.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.23.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.23.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.23.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.23.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.23.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.23.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.23.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.23.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.23.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.23.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.23.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.3.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.3.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.3.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.3.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.3.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.3.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.3.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.3.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.3.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.3.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.3.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.3.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": 
"model.layers.4.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.4.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.4.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.4.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.4.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.4.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.4.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.4.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.4.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.4.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.4.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.4.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.5.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.5.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.5.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.5.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.5.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.5.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.5.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.5.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.5.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.5.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.5.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.5.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.6.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.6.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.6.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.6.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.6.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.6.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.6.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.6.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.6.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.6.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.6.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.6.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.7.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.7.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.7.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.7.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.7.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.7.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.7.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.7.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.7.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.7.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.7.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.7.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.8.input_layernorm.weight", "shape": [896]}, + {"name": 
"model.layers.8.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.8.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.8.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.8.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.8.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.8.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.8.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.8.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.8.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.8.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.8.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.9.input_layernorm.weight", "shape": [896]}, + {"name": "model.layers.9.mlp.down_proj.weight", "shape": [896, 4864]}, + {"name": "model.layers.9.mlp.gate_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.9.mlp.up_proj.weight", "shape": [4864, 896]}, + {"name": "model.layers.9.post_attention_layernorm.weight", "shape": [896]}, + {"name": "model.layers.9.self_attn.k_proj.bias", "shape": [128]}, + {"name": "model.layers.9.self_attn.k_proj.weight", "shape": [128, 896]}, + {"name": "model.layers.9.self_attn.o_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.9.self_attn.q_proj.bias", "shape": [896]}, + {"name": "model.layers.9.self_attn.q_proj.weight", "shape": [896, 896]}, + {"name": "model.layers.9.self_attn.v_proj.bias", "shape": [128]}, + {"name": "model.layers.9.self_attn.v_proj.weight", "shape": [128, 896]}, + {"name": "model.norm.weight", "shape": [896]} + ] +} diff --git a/model/qwen2/model.go b/model/qwen2/model.go new file mode 100644 index 000000000..44c3cc693 --- /dev/null +++ b/model/qwen2/model.go @@ -0,0 +1,201 @@ +package qwen2 + +import ( + "fmt" + "log/slog" + "math" + + "github.com/ollama/ollama/cache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/model" +) + +type Options struct { + hiddenSize, numHeads, numKVHeads int64 + eps, ropeBase, ropeScale float32 + ropeDim uint32 +} + +type Model struct { + model.Base + model.BytePairEncoding + + TokenEmbedding *nn.Embedding `gguf:"token_embd"` + Layers []Layer `gguf:"blk"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + *Options +} + +func New(c ml.Config) (model.Model, error) { + m := &Model{ + BytePairEncoding: model.BytePairEncoding{ + Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + Vocabulary: &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Uints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + BOS: c.Uint("tokenizer.ggml.bos_token_id"), + EOS: c.Uint("tokenizer.ggml.eos_token_id"), + }, + }, + Layers: make([]Layer, c.Uint("block_count")), + Options: &Options{ + hiddenSize: int64(c.Uint("embedding_length")), + numHeads: int64(c.Uint("attention.head_count")), + numKVHeads: int64(c.Uint("attention.head_count_kv")), + eps: c.Float("attention.layer_norm_rms_epsilon"), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + ropeDim: c.Uint("rope.dimension_count", 64), + }, + } + + slog.Debug("model configuration", + "arch", "qwen2", + "vocab_size", len(c.Strings("tokenizer.ggml.tokens")), + "n_merges", 
diff --git a/model/qwen2/model.go b/model/qwen2/model.go
new file mode 100644
index 000000000..44c3cc693
--- /dev/null
+++ b/model/qwen2/model.go
@@ -0,0 +1,201 @@
+package qwen2
+
+import (
+	"fmt"
+	"log/slog"
+	"math"
+
+	"github.com/ollama/ollama/cache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int64
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*Options
+}
+
+func New(c ml.Config) (model.Model, error) {
+	m := &Model{
+		BytePairEncoding: model.BytePairEncoding{
+			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			Vocabulary: &model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
+			},
+		},
+		Layers: make([]Layer, c.Uint("block_count")),
+		Options: &Options{
+			hiddenSize: int64(c.Uint("embedding_length")),
+			numHeads:   int64(c.Uint("attention.head_count")),
+			numKVHeads: int64(c.Uint("attention.head_count_kv")),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:   c.Float("rope.freq_base"),
+			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeDim:    c.Uint("rope.dimension_count", 64),
+		},
+	}
+
+	slog.Debug("model configuration",
+		"arch", "qwen2",
+		"vocab_size", len(c.Strings("tokenizer.ggml.tokens")),
+		"n_merges", len(c.Strings("tokenizer.ggml.merges")),
+		"n_ctx_train", c.Uint("context_length"),
+		"n_embd", m.hiddenSize,
+		"n_layer", len(m.Layers),
+		"n_head", m.numHeads,
+		"n_head_kv", m.numKVHeads,
+		"n_rot", m.ropeDim,
+		"f_norm_rms_eps", m.eps,
+		"rope_freq_base", m.ropeBase,
+		"rope_freq_scale", m.ropeScale,
+		"bos_token_id", c.Uint("tokenizer.ggml.bos_token_id"),
+		"eos_token_id", c.Uint("tokenizer.ggml.eos_token_id"),
+	)
+
+	return m, nil
+}
+
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
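+// Forward computes one attention block: Q/K/V projections with RoPE applied
+// to Q and K, grouped-query attention (numKVHeads may be smaller than
+// numHeads) through the KV cache, and a final output projection. The
+// intermediate projections are traced for comparison against the reference
+// graph.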
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, inputPositions ml.Tensor, layerIdx int, cache cache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.q_proj", layerIdx), q)
+
+	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, inputPositions, nil, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.q_proj.rope", layerIdx), q)
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, inputPositions, nil, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.k_proj.rope", layerIdx), k)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.v_proj", layerIdx), v)
+
+	k, v, mask := cache.Put(ctx, k, v)
+
+	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	kq := k.Mulmat(ctx, q)
+	kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	kq = kq.Add(ctx, mask)
+	kq = kq.Softmax(ctx)
+
+	kqv := v.Mulmat(ctx, kq)
+	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	output := sa.Output.Forward(ctx, kqv)
+	return output
+}
+
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+// Forward applies the SwiGLU feed-forward block: down(silu(gate(x)) * up(x)).
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *SelfAttention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, layerIdx int, cache cache.Cache, opts *Options) ml.Tensor {
+	ctx.Trace(fmt.Sprintf("model.layers.%d.input", layerIdx), hiddenState)
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.input_layernorm", layerIdx), hiddenState)
+
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, layerIdx, cache, opts)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.output", layerIdx), hiddenState)
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+	ctx.Trace(fmt.Sprintf("model.layers.%d.self_attn.residual", layerIdx), hiddenState)
+
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.post_attention_layernorm", layerIdx), hiddenState)
+
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.mlp", layerIdx), hiddenState)
+
+	output := hiddenState.Add(ctx, residual)
+	ctx.Trace(fmt.Sprintf("model.layers.%d.output", layerIdx), output)
+
+	return output
+}
+
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	slog.Debug("input tokens", "input_ids", opts.Inputs())
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	ctx.Trace("model.embed_tokens", hiddenState)
+
+	for i, layer := range m.Layers {
+		hiddenState = layer.Forward(ctx, hiddenState, positions, i, opts.Cache.Sub(i), m.Options)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	ctx.Trace("model.norm", hiddenState)
+
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+	ctx.Trace("model.output", hiddenState)
+
+	outputs, err := ctx.FromIntSlice(opts.Outputs(), len(opts.Outputs()))
+	if err != nil {
+		return nil, err
+	}
+
+	return hiddenState.Rows(ctx, outputs), nil
+}
+
+func init() {
+	model.Register("qwen2", New)
+}
diff --git a/runner/newrunner/runner.go b/runner/newrunner/runner.go
index 742c45b6e..844cb222a 100644
--- a/runner/newrunner/runner.go
+++ b/runner/newrunner/runner.go
@@ -32,6 +32,7 @@ import (
 
 	_ "github.com/ollama/ollama/model/llama"
 	_ "github.com/ollama/ollama/model/mllama"
+	_ "github.com/ollama/ollama/model/qwen2"
 )
 
 // input is an element of the prompt to process, either