Revert "Update vendored llama.cpp to b7847" (#14061)

Jeffrey Morgan
2026-02-03 18:39:36 -08:00
committed by GitHub
parent a6355329bf
commit b1fccabb34
240 changed files with 5050 additions and 21247 deletions


@@ -92,7 +92,7 @@ jobs:
           flags: ''
         - os: windows
           arch: amd64
-          preset: 'CUDA 13 Windows'
+          preset: 'CUDA 13'
           install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
           cuda-components:
             - '"cudart"'


@@ -40,17 +40,7 @@
"name": "CUDA 13", "name": "CUDA 13",
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],
"cacheVariables": { "cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual", "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 4",
"OLLAMA_RUNNER_DIR": "cuda_v13"
}
},
{
"name": "CUDA 13 Windows",
"inherits": [ "CUDA" ],
"description": "Reduced architecture set for Windows to avoid MSVC template compilation issues",
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;89-virtual;100-virtual;120-virtual",
"CMAKE_CUDA_FLAGS": "-t 4", "CMAKE_CUDA_FLAGS": "-t 4",
"OLLAMA_RUNNER_DIR": "cuda_v13" "OLLAMA_RUNNER_DIR": "cuda_v13"
} }
@@ -148,11 +138,6 @@
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],
"configurePreset": "CUDA 13" "configurePreset": "CUDA 13"
}, },
{
"name": "CUDA 13 Windows",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13 Windows"
},
{ {
"name": "JetPack 5", "name": "JetPack 5",
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],


@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=a5bb8ba4c50257437630c136210396810741bbf7
+FETCH_HEAD=ec98e2002
 .PHONY: help
 help:


@@ -73,18 +73,13 @@ func manhattanDistance[V float32 | float64](v1, v2 []V) V {
 }
 func TestEmbedCosineDistanceCorrelation(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			testCases := []struct {
 				a string
 				b string
@@ -494,19 +489,14 @@ func TestEmbedTruncation(t *testing.T) {
 // TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
 func TestEmbedLargeInput(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		model := model
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
 			defer mcancel()


@@ -21,10 +21,9 @@ func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
 }
 func TestAPIToolCalling(t *testing.T) {
-	initialTimeout := 90 * time.Second
-	streamTimeout := 90 * time.Second
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	initialTimeout := 60 * time.Second
+	streamTimeout := 60 * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
@@ -48,12 +47,8 @@ func TestAPIToolCalling(t *testing.T) {
"granite3.3": 7, "granite3.3": 7,
} }
started := time.Now()
for _, model := range libraryToolsModels { for _, model := range libraryToolsModels {
t.Run(model, func(t *testing.T) { t.Run(model, func(t *testing.T) {
if time.Since(started) > softTimeout {
t.Skip("skipping - soft timeout exceeded")
}
if v, ok := minVRAM[model]; ok { if v, ok := minVRAM[model]; ok {
skipUnderMinVRAM(t, v) skipUnderMinVRAM(t, v)
} }


@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches
 ### Updating Base Commit
-To update to a new base commit:
-1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
-2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
-3. **Apply patches**: Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
-   ```shell
-   make -f Makefile.sync apply-patches
-   ```
-4. **Resolve conflicts** (if any): When `git am` fails on a patch:
-   - Fix conflicts in `./vendor/`
-   - Stage the resolved files: `git -C llama/vendor add <file>`
-   - Continue: `git -C llama/vendor am --continue`
-   - Re-run: `make -f Makefile.sync apply-patches`
-   - Repeat until all patches are applied.
-5. **Regenerate patches and sync**:
-   ```shell
-   make -f Makefile.sync format-patches sync
-   ```
+**Pin to new base commit**
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.
+When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
+```shell
+make -f Makefile.sync apply-patches
+```
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+Once all patches are applied, commit the changes to the tracking repository.
+```shell
+make -f Makefile.sync format-patches sync
+```
 ### Generating Patches
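Both the removed and the restored wording of this section describe the same loop; roughly, and assembled only from the commands quoted above (not an additional change in this commit), it looks like this:

```shell
# apply the patch series on top of the new FETCH_HEAD; git am stops at the first patch that fails
make -f Makefile.sync apply-patches

# on a conflict: fix the files under llama/vendor, then stage and continue the series
git -C llama/vendor add <file>
git -C llama/vendor am --continue
make -f Makefile.sync apply-patches        # rerun until every patch applies

# once everything applies cleanly, regenerate the patches and sync
make -f Makefile.sync format-patches sync
```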

llama/build-info.cpp (generated, vendored)

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "a5bb8ba4c50257437630c136210396810741bbf7";
+char const *LLAMA_COMMIT = "ec98e2002";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";


@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
         case GGML_SCHED_PRIO_REALTIME: p = -20; break;
     }
-    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
         LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
         return false;
     }
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
     impl() = default;
     ~impl() = default;
-    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
     llama_model_ptr   model;
     llama_context_ptr context;
     std::vector<llama_adapter_lora_ptr> lora;
     std::vector<common_sampler_ptr> samplers;
-    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);
     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
+                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
                 params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    // load and optionally apply lora adapters (must be loaded before context creation)
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
-            pimpl->model.reset(model);
-            return;
-        }
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
     // updates params.sampling
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);
@@ -1163,18 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
     //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}
-    // init the backend samplers as part of the context creation
     pimpl->samplers.resize(cparams.n_seq_max);
-    pimpl->samplers_seq_config.resize(cparams.n_seq_max);
     for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
         pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
-    }
-    if (params.sampling.backend_sampling) {
-        cparams.samplers   = pimpl->samplers_seq_config.data();
-        cparams.n_samplers = pimpl->samplers_seq_config.size();
     }
     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1198,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
     return pimpl->samplers[seq_id].get();
 }
-void common_init_result::reset_samplers() {
-    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
-        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
-    }
-}
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
@@ -1279,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         }
     }
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            return res;
+        }
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
@@ -1319,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
-        // reset samplers to reset RNG state after warmup to the seeded state
-        res->reset_samplers();
     }
     return res;
@@ -1360,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
-    mparams.n_gpu_layers = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
     mparams.main_gpu        = params.main_gpu;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
-    mparams.use_direct_io   = params.use_direct_io;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;


@@ -57,8 +57,6 @@ extern const char * LLAMA_COMMIT;
 extern const char * LLAMA_COMPILER;
 extern const char * LLAMA_BUILD_TARGET;
-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 struct common_control_vector_load_info;
 //
@@ -82,8 +80,6 @@ int32_t cpu_get_num_math();
 //
 enum llama_example {
-    LLAMA_EXAMPLE_BATCHED,
-    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,
@@ -121,7 +117,6 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
     COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
-    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
 };
 // dimensionality reduction methods, used by cvector-generator
@@ -169,34 +164,32 @@ enum common_params_sampling_config : uint64_t {
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
     int32_t n_prev             = 64;    // number of previous tokens to remember
     int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t top_k              = 40;    // <= 0 to use vocab size
     float   top_p              = 0.95f; // 1.0 = disabled
     float   min_p              = 0.05f; // 0.0 = disabled
     float   xtc_probability    = 0.00f; // 0.0 = disabled
     float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
     float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
     float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range     = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   penalty_repeat     = 1.00f; // 1.0 = disabled
     float   penalty_freq       = 0.00f; // 0.0 = disabled
     float   penalty_present    = 0.00f; // 0.0 = disabled
     float   dry_multiplier     = 0.0f;  // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
     float   dry_base           = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
     int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
     int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
-    float   adaptive_target    = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
-    float   adaptive_decay     = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
-    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   top_n_sigma        = -1.00f; // -1.0 = disabled
-    float   mirostat_tau       = 5.00f; // target entropy
-    float   mirostat_eta       = 0.10f; // learning rate
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f;// -1.0 = disabled
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
     uint64_t user_sampling_config = 0;  // bitfield to track user-specified samplers
@@ -223,8 +216,6 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
-    bool backend_sampling = false;
     bool has_logit_bias() const {
         return !logit_bias.empty();
     }
@@ -286,7 +277,6 @@ struct common_params_diffusion {
 };
 // reasoning API response format (not to be confused as chat template's reasoning format)
-// only used by server
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
@@ -339,14 +329,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
     float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool    fit_params = true; // whether to fit unset model/context parameters to free device memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
-    // margin per device in bytes for fitting parameters to free memory:
-    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
+    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -382,11 +370,6 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits // NOLINT
-    // llama-debug specific options
-    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
-    bool save_logits = false;               // whether to save logits to files // NOLINT
-    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -437,8 +420,7 @@ struct common_params {
     bool kv_unified       = false; // enable unified KV cache
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap         = true;  // enable mmap to use filesystem cache
-    bool use_direct_io    = true;  // read from disk without buffering for faster model loading
+    bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
     bool verbose_prompt   = false; // print prompt tokens before generation
     bool display_prompt   = true;  // print prompt before generation
@@ -482,7 +464,6 @@ struct common_params {
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
-    bool    cache_prompt   = true;         // whether to enable prompt caching
     int32_t n_ctx_checkpoints = 8;    // max number of context checkpoints per slot
     int32_t cache_ram_mib     = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
@@ -494,8 +475,7 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
     std::vector<std::string> api_keys;
@@ -504,11 +484,8 @@ struct common_params {
     std::map<std::string, std::string> default_template_kwargs;
-    // webui configs
-    bool webui = true;
-    std::string webui_config_json;
     // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
     bool endpoint_slots   = true;
     bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
@@ -708,9 +685,7 @@ struct common_init_result {
     llama_model    * model();
     llama_context  * context();
     common_sampler * sampler(llama_seq_id seq_id);
-    void reset_samplers();
     std::vector<llama_adapter_lora_ptr> & lora();


@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
-    struct llama_sampler * grmr;
     struct llama_sampler * chain;
+    bool grammar;
     ring_buffer<llama_token> prev;
     std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
     }
     void set_logits(struct llama_context * ctx, int idx) {
-        const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
-        const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
-        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
+        const auto * logits = llama_get_logits_ith(ctx, idx);
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
         const int n_vocab = llama_vocab_n_tokens(vocab);
-        if (sampled_probs) {
-            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
-            cur.resize(sampled_probs_count);
-            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
-            }
-        } else if (sampled_logits) {
-            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
-            cur.resize(sampled_logits_count);
-            for (uint32_t i = 0; i < sampled_logits_count; i++) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
-            }
-        } else {
-            const auto * logits = llama_get_logits_ith(ctx, idx);
-            GGML_ASSERT(logits != nullptr);
-            cur.resize(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-            }
+        cur.resize(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
         }
         cur_p = { cur.data(), cur.size(), -1, false };
@@ -167,59 +151,54 @@ std::string common_params_sampling::print() const {
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f", "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay); mirostat, mirostat_eta, mirostat_tau);
return std::string(result); return std::string(result);
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model); const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams); llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers; std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) { if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE #ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else #else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE #endif // LLAMA_USE_LLGUIDANCE
} else { } else {
std::vector<std::string> trigger_patterns; std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens; std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) { for (const auto & trigger : params.grammar_triggers) {
switch (trigger.type) { switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{ {
const auto & word = trigger.value; const auto & word = trigger.value;
trigger_patterns.push_back(regex_escape(word)); patterns_anywhere.push_back(regex_escape(word));
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
{ {
trigger_patterns.push_back(trigger.value); patterns_anywhere.push_back(trigger.value);
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{ {
const auto & pattern = trigger.value; trigger_patterns.push_back(trigger.value);
std::string anchored = "^$";
if (!pattern.empty()) {
anchored = (pattern.front() != '^' ? "^" : "")
+ pattern
+ (pattern.back() != '$' ? "$" : "");
}
trigger_patterns.push_back(anchored);
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN: case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
                 }
             }
+            if (!patterns_anywhere.empty()) {
+                trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+            }
             std::vector<const char *> trigger_patterns_c;
             trigger_patterns_c.reserve(trigger_patterns.size());
             for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                        trigger_tokens.data(), trigger_tokens.size());
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size()));
             } else {
-                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
             }
+            grammar = true;
         }
     }
@@ -255,9 +241,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     }
     if (params.mirostat == 0) {
-        bool use_adaptive_p = false; // see below
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
@@ -267,54 +250,43 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
                         for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }
-                        samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
-                    samplers.push_back(llama_sampler_init_top_k(params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k (params.top_k));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_P:
-                    samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
                     samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                     break;
                 case COMMON_SAMPLER_TYPE_MIN_P:
-                    samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_XTC:
-                    samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
-                    samplers.push_back(llama_sampler_init_infill(vocab));
+                    samplers.push_back(llama_sampler_init_infill (vocab));
                     break;
                 case COMMON_SAMPLER_TYPE_PENALTIES:
-                    samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
-                    // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
-                    // a single token, so we will add `dist` at the end of the chain by default,
-                    // unless the user specifically included `adaptive-p`. we set this flag here
-                    // so we know to add the sampler at the very end.
-                    use_adaptive_p = true;
+                    samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                     break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        if (use_adaptive_p) {
-            // only if user explicitly included adaptive-p sampler
-            samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
-        } else {
-            // default: sample from distribution
-            samplers.push_back(llama_sampler_init_dist(params.seed));
-        }
+        samplers.push_back(llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         samplers.push_back(llama_sampler_init_temp(params.temp));
         samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
@@ -329,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         llama_sampler_chain_add(chain, smpl);
     }
-    if (grmr && params.backend_sampling) {
-        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
-        params.backend_sampling = false;
-    }
     auto * result = new common_sampler {
         /* .params  = */ params,
-        /* .grmr    = */ grmr,
         /* .chain   = */ chain,
+        /* .grammar = */ grammar,
         /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur     = */ {},
         /* .cur_p   = */ {},
@@ -348,45 +314,47 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 }
 void common_sampler_free(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->chain);
+        delete gsmpl;
     }
-    llama_sampler_free(gsmpl->grmr);
-    llama_sampler_free(gsmpl->chain);
-    delete gsmpl;
 }
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (!gsmpl) {
-        return;
-    }
     const auto tm = gsmpl->tm();
-    if (gsmpl->grmr && accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
-    llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }
     gsmpl->prev.push_back(token);
 }
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
-    }
     gsmpl->reset();
 }
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
         /* .prev    = */ gsmpl->prev,
         /* .cur     = */ gsmpl->cur,
         /* .cur_p   = */ gsmpl->cur_p,
@@ -439,14 +407,10 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 }
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return nullptr;
-    }
     return gsmpl->chain;
 }
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     llama_synchronize(ctx);
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -454,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     llama_token id = LLAMA_TOKEN_NULL;
-    auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-    // Check if a backend sampler has already sampled a token in which case we
-    // return that token id directly.
-    {
-        id = llama_get_sampled_token_ith(ctx, idx);
-        if (id != LLAMA_TOKEN_NULL) {
-            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
-            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
-            return id;
-        }
-    }
     gsmpl->set_logits(ctx, idx);
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-    llama_sampler_apply(chain, &cur_p);
-    id = cur_p.data[cur_p.selected].id;
-    if (grammar_first) {
-        return id;
-    }
-    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-        llama_sampler_apply(grmr, &single_token_data_array);
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-    llama_sampler_apply(grmr, &cur_p);
     llama_sampler_apply(chain, &cur_p);
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -518,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
     std::vector<llama_token> result;
@@ -526,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
         common_sampler_accept(gsmpl, id, true);
@@ -538,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
         common_sampler_accept(gsmpl, id, true);
@@ -548,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -639,7 +553,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
         case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return 'a';
         default : return '?';
     }
 }
@@ -656,7 +569,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
         case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return "adaptive_p";
         default : return "";
     }
 }
@@ -673,7 +585,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "xtc", COMMON_SAMPLER_TYPE_XTC }, { "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL }, { "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@@ -689,7 +600,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;
@@ -726,7 +636,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P),  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
     };
     std::vector<common_sampler_type> samplers;


@@ -36,8 +36,7 @@ struct common_sampler;
 // llama_sampler API overloads
-// note: can mutate params in some cases
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 void common_sampler_free(struct common_sampler * gsmpl);
@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
-// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
 // generalized version of common_sampler_sample
 //
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);


@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
 };
 struct llama_adapter_lora_deleter {
-    void operator()(llama_adapter_lora *) {
-        // llama_adapter_lora_free is deprecated
-    }
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;


@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
-        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
+        int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
-        bool use_direct_io;   // use direct io, takes precedence over use_mmap
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
         bool no_alloc;        // only load metadata and simulate memory allocations
     };
-    struct llama_sampler_seq_config {
-        llama_seq_id seq_id;
-        struct llama_sampler * sampler;
-    };
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     //       https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        // [EXPERIMENTAL]
-        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
-        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
-        struct llama_sampler_seq_config * samplers;
-        size_t n_samplers;
     };
// model quantization parameters // model quantization parameters
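
Editor's note: as a rough sketch of how the per-sequence backend sampler configuration removed by this revert was meant to be wired up, based only on the llama_sampler_seq_config and samplers/n_samplers declarations above (llama_init_from_model and the sampler-chain helpers are assumed from the surrounding llama.h; this only applies to the pre-revert header):

    #include "llama.h"

    // sketch: attach one greedy sampler chain to sequence 0 at context creation,
    // via the llama_sampler_seq_config / samplers fields removed by this revert
    static llama_context * ctx_with_backend_sampler(llama_model * model) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());

        llama_sampler_seq_config cfg = { /*seq_id=*/ 0, /*sampler=*/ chain };

        llama_context_params cparams = llama_context_default_params();
        cparams.samplers   = &cfg; // caller must keep the chain alive (see note above)
        cparams.n_samplers = 1;

        return llama_init_from_model(model, cparams);
    }
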
@@ -479,24 +467,16 @@ extern "C" {
// Frees all allocated memory // Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited) // fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns true if the parameters could be successfully modified to fit device memory // returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state // this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified LLAMA_API bool llama_params_fit(
// with the exception of the context size which is modified if and only if equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model, const char * path_model,
struct llama_model_params * mparams, struct llama_model_params * mparams,
struct llama_context_params * cparams, struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes size_t margin, // margin of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
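
Editor's note: a minimal sketch of calling the bool llama_params_fit signature restored by this revert, assuming llama_max_devices() and llama_max_tensor_buft_overrides() are the helpers the comments above refer to; the 512 MiB margin and 4096-token minimum context are arbitrary example values:

    #include <vector>
    #include "llama.h"

    // sketch: ask llama.cpp to shrink mparams/cparams until they are projected to
    // fit free device memory (writable scratch buffers sized per the comments above)
    static bool fit_params(const char * path_model,
                           llama_model_params   & mparams,
                           llama_context_params & cparams) {
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

        return llama_params_fit(path_model, &mparams, &cparams,
                                tensor_split.data(), overrides.data(),
                                /*margin=*/ 512ull*1024*1024,
                                /*n_ctx_min=*/ 4096,
                                GGML_LOG_LEVEL_INFO);
    }
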
@@ -537,7 +517,6 @@ extern "C" {
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -621,8 +600,6 @@ extern "C" {
// //
// Load a LoRA adapter from file // Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model, struct llama_model * model,
const char * path_lora); const char * path_lora);
@@ -647,8 +624,7 @@ extern "C" {
// Manually free a LoRA adapter // Manually free a LoRA adapter
// NOTE: loaded adapters will be freed when the associated model is deleted // NOTE: loaded adapters will be freed when the associated model is deleted
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter), LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
"adapters are now freed together with the associated model");
// Get the invocation tokens if the current lora is an alora // Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
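
Editor's note: a small sketch of the adapter lifecycle against the API as kept by this revert; llama_set_adapter_lora is assumed from the surrounding llama.h and the adapter path is a hypothetical example:

    #include "llama.h"

    // sketch: load an adapter for a model, then enable it on a context
    static void apply_lora(llama_model * model, llama_context * ctx) {
        llama_adapter_lora * lora = llama_adapter_lora_init(model, "my-adapter.gguf");
        if (lora != nullptr) {
            llama_set_adapter_lora(ctx, lora, /*scale=*/ 1.0f);
        }
        // per the NOTE above, loaded adapters are freed when the model is deleted,
        // or earlier via llama_adapter_lora_free()
    }
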
@@ -1007,32 +983,6 @@ extern "C" {
// otherwise: float[n_embd] (1-dimensional) // otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// backend sampling API [EXPERIMENTAL]
// note: use only if the llama_context was created with at least one llama_sampler_seq_config
//
// Get the backend sampled token for the ith token.
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled logits for the ith token
// Returns NULL if no logits were sampled.
LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled candidates (token ids) for the ith token
// These are needed to map probability/logit indices to vocab token ids.
// Returns NULL if no candidates were sampled.
LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
// //
// Vocab // Vocab
// //
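
Editor's note: a sketch of reading back a backend-sampled token and its probabilities for output row i, using the experimental getters removed above; this only compiles against the pre-revert header:

    #include <cstdio>
    #include "llama.h"

    // sketch: dump the backend-sampled token and its candidate probabilities
    static void dump_sampled(llama_context * ctx, int32_t i) {
        const llama_token tok = llama_get_sampled_token_ith(ctx, i);
        if (tok == LLAMA_TOKEN_NULL) {
            return; // no backend sampler ran for this row
        }
        const float       * probs = llama_get_sampled_probs_ith(ctx, i);
        const llama_token * cand  = llama_get_sampled_candidates_ith(ctx, i);
        const uint32_t      n     = llama_get_sampled_probs_count_ith(ctx, i);
        printf("sampled token: %d\n", tok);
        for (uint32_t j = 0; probs != nullptr && cand != nullptr && j < n; ++j) {
            printf("  candidate %d -> p = %.4f\n", cand[j], probs[j]);
        }
    }
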
@@ -1204,16 +1154,11 @@ extern "C" {
// //
// llama_sampler_free(smpl); // llama_sampler_free(smpl);
// //
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
//
typedef void * llama_sampler_context_t; typedef void * llama_sampler_context_t;
struct llama_sampler_data {
struct ggml_tensor * logits;
struct ggml_tensor * probs;
struct ggml_tensor * sampled;
struct ggml_tensor * candidates;
};
// user code can implement the interface below in order to create custom llama_sampler // user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i { struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1223,44 +1168,17 @@ extern "C" {
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
// [EXPERIMENTAL] // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
// backend sampling interface: //void (*apply_ggml) (struct llama_sampler * smpl, ...);
// return true if the backend supports all ops needed by the sampler
// note: call once per sampler
bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
// call after .backend_apply()
void (*backend_accept)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_tensor * selected_token);
// call after .backend_init()
void (*backend_apply)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct llama_sampler_data * data);
// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);
}; };
struct llama_sampler { struct llama_sampler {
struct llama_sampler_i * iface; const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
llama_sampler_context_t ctx;
}; };
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i: // mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx); LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
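
Editor's note: a minimal custom sampler built on the non-backend half of llama_sampler_i / llama_sampler_init, which this revert keeps; it merely selects candidate 0, so treat it as an illustration of the interface rather than a useful sampler (members are assigned by name to avoid depending on field order):

    #include "llama.h"

    static const char * pick_first_name(const llama_sampler * /*smpl*/) { return "pick-first"; }

    static void pick_first_apply(llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
        if (cur_p->size > 0) {
            cur_p->selected = 0; // always take the first candidate
        }
    }

    static llama_sampler * make_pick_first() {
        static llama_sampler_i iface = {};
        iface.name  = pick_first_name;
        iface.apply = pick_first_apply; // accept/reset/clone/free stay NULL, which is allowed when ctx is NULL
        return llama_sampler_init(&iface, /*ctx=*/ nullptr);
    }
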
@@ -1276,15 +1194,7 @@ extern "C" {
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl); LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
// return NULL if:
// - the sampler is NULL
// - the sampler is not a llama_sampler_chain
// - the index is out of bounds, unless i == -1
// - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
// the total number of samplers in the chain
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1293,9 +1203,7 @@ extern "C" {
// available samplers: // available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// seed == LLAMA_DEFAULT_SEED to use a random seed.
LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop /// Setting k <= 0 makes this a noop
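
Editor's note: a short sketch of building a chain and walking it with the accessors documented above; LLAMA_DEFAULT_SEED is the random-seed sentinel mentioned in the removed comment:

    #include <cstdio>
    #include "llama.h"

    // sketch: create a chain, add a dist sampler, list the chain's contents
    static void list_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // chain takes ownership
        for (int i = 0; i < llama_sampler_chain_n(chain); ++i) {
            printf("%d: %s\n", i, llama_sampler_name(llama_sampler_chain_get(chain, i)));
        }
        llama_sampler_free(chain); // also frees the samplers it owns
    }
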
@@ -1396,33 +1304,6 @@ extern "C" {
const char ** seq_breakers, const char ** seq_breakers,
size_t num_breakers); size_t num_breakers);
/// adaptive-p: select tokens near a configurable target probability over time.
///
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
/// that fall near a user-configurable probability target.
///
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
/// adapted target probability at each sampling step, thus maintaining the desired target
/// probability over time.
///
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
/// in the sampler chain (like mirostat, dist, greedy).
///
/// only mild truncation before this sampler is recommended. we suggest applying min-p
/// before adaptive-p as the only other active sampler in the chain.
///
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
/// @param seed RNG seed
///
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
///
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab, int32_t n_vocab,
int32_t n_logit_bias, int32_t n_logit_bias,
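
Editor's note: a sketch of the chain the removed adaptive-p notes recommend, i.e. mild min-p truncation followed by adaptive-p as the final, token-selecting sampler; this only applies to the pre-revert header, and the target/decay values are arbitrary examples within the documented ranges:

    #include "llama.h"

    static llama_sampler * make_adaptive_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_min_p(/*p=*/ 0.05f, /*min_keep=*/ 1));
        llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(/*target=*/ 0.6f, /*decay=*/ 0.9f, /*seed=*/ LLAMA_DEFAULT_SEED));
        return chain; // adaptive-p must stay last in the chain, like mirostat/dist/greedy
    }
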
@@ -1476,12 +1357,12 @@ extern "C" {
/// @details Build a split GGUF final path for this chunk. /// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length. // Returns the split_path length.
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count); LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length. // Returns the split_prefix length.
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count); LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Print system information // Print system information
LLAMA_API const char * llama_print_system_info(void); LLAMA_API const char * llama_print_system_info(void);
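
Editor's note: a small sketch reproducing the split-path examples from the comments above:

    #include <cstdio>
    #include "llama.h"

    static void split_demo() {
        char split_path[128];
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
        // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

        char split_prefix[64];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            printf("%s\n", split_prefix); // "/models/ggml-model-q4_0"
        }
    }
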

View File

@@ -411,9 +411,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
} }
} }
// register adapter with model
model.loras.insert(&adapter);
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
} }
@@ -471,8 +468,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
return snprintf(buf, buf_size, "%s", it->second.c_str()); return snprintf(buf, buf_size, "%s", it->second.c_str());
} }
void llama_adapter_lora_free(llama_adapter_lora *) { void llama_adapter_lora_free(llama_adapter_lora * adapter) {
// deprecated: adapters are freed by llama_model's destructor delete adapter;
} }
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) { uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {

View File

@@ -77,10 +77,6 @@ struct llama_adapter_lora {
~llama_adapter_lora() = default; ~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w); llama_adapter_lora_weight * get_weight(ggml_tensor * w);
uint32_t get_n_nodes() const {
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
}
}; };
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>; using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;

View File

@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" }, { LLM_ARCH_PLAMO2, "plamo2" },
{ LLM_ARCH_PLAMO3, "plamo3" },
{ LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" }, { LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -81,7 +79,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" }, { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
{ LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" }, { LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_EXAONE_MOE, "exaone-moe" },
{ LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_RWKV7, "rwkv7" }, { LLM_ARCH_RWKV7, "rwkv7" },
@@ -118,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@@ -154,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" }, { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
{ LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" },
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -212,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -224,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -509,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3: case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM,
@@ -791,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
}; };
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_JINA_BERT_V2:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -954,8 +930,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V, LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_GATE_EXPS,
@@ -1086,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_POST_NORM,
}; };
case LLM_ARCH_PLAMO3:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_CODESHELL: case LLM_ARCH_CODESHELL:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -1732,38 +1690,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_POST_NORM,
}; };
case LLM_ARCH_EXAONE_MOE:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -2114,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2, LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT,
LLM_TENSOR_DENSE_2_OUT,
}; };
case LLM_ARCH_LFM2MOE: case LLM_ARCH_LFM2MOE:
return { return {
@@ -2133,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2, LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2249,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_DOWN,
LLM_TENSOR_VISEXP_FFN_UP, LLM_TENSOR_VISEXP_FFN_UP,
}; };
case LLM_ARCH_MIMO2:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
case LLM_ARCH_GPTJ: case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN: case LLM_ARCH_UNKNOWN:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
}; };
case LLM_ARCH_MAINCODER:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_SOLAR: case LLM_ARCH_SOLAR:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,

View File

@@ -24,7 +24,6 @@ enum llm_arch {
LLM_ARCH_STARCODER, LLM_ARCH_STARCODER,
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BERT, LLM_ARCH_BERT,
LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT, LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
LLM_ARCH_PHIMOE, LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO, LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2, LLM_ARCH_PLAMO2,
LLM_ARCH_PLAMO3,
LLM_ARCH_CODESHELL, LLM_ARCH_CODESHELL,
LLM_ARCH_ORION, LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2, LLM_ARCH_INTERNLM2,
@@ -85,7 +83,6 @@ enum llm_arch {
LLM_ARCH_NEMOTRON_H_MOE, LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_EXAONE, LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4, LLM_ARCH_EXAONE4,
LLM_ARCH_EXAONE_MOE,
LLM_ARCH_RWKV6, LLM_ARCH_RWKV6,
LLM_ARCH_RWKV6QWEN2, LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_RWKV7, LLM_ARCH_RWKV7,
@@ -122,9 +119,6 @@ enum llm_arch {
LLM_ARCH_RND1, LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED, LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3, LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@@ -158,7 +152,6 @@ enum llm_kv {
LLM_KV_VOCAB_SIZE, LLM_KV_VOCAB_SIZE,
LLM_KV_CONTEXT_LENGTH, LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH, LLM_KV_EMBEDDING_LENGTH,
LLM_KV_EMBEDDING_LENGTH_OUT,
LLM_KV_FEATURES_LENGTH, LLM_KV_FEATURES_LENGTH,
LLM_KV_BLOCK_COUNT, LLM_KV_BLOCK_COUNT,
LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -216,7 +209,6 @@ enum llm_kv {
LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -228,7 +220,6 @@ enum llm_kv {
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR, LLM_KV_ROPE_SCALING_FACTOR,

View File

@@ -57,7 +57,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 }, { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
{ "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE }, { "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
@@ -75,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED }, { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -138,9 +136,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("[gMASK]<sop>")) { } else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4; return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
if (tmpl_contains("<|tool_declare|>")) {
return LLM_CHAT_TEMPLATE_EXAONE_MOE;
}
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE; return LLM_CHAT_TEMPLATE_GLMEDGE;
@@ -221,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2; return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) { } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED; return LLM_CHAT_TEMPLATE_PANGU_EMBED;
} else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
} }
return LLM_CHAT_TEMPLATE_UNKNOWN; return LLM_CHAT_TEMPLATE_UNKNOWN;
} }
@@ -580,22 +573,6 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "[|assistant|]"; ss << "[|assistant|]";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "user") {
ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "assistant") {
ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "tool") {
ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
}
}
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token // this template requires the model to have "\n\n" as EOT token
for (size_t i = 0; i < chat.size(); i++) { for (size_t i = 0; i < chat.size(); i++) {
@@ -868,14 +845,6 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "[unused9]助手:"; ss << "[unused9]助手:";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|begin|>assistant";
}
} else { } else {
// template not supported // template not supported
return -1; return -1;
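
Editor's note: a sketch of what the removed "solar-open" branch would render for a short chat, going through the public llama_chat_apply_template entry point (assumed from llama.h); only meaningful against a pre-revert build that still registers this template:

    #include <cstdio>
    #include "llama.h"

    static void solar_open_demo() {
        const llama_chat_message chat[] = {
            { "user",      "hi"     },
            { "assistant", "hello!" },
        };
        char buf[256];
        const int32_t n = llama_chat_apply_template("solar-open", chat, 2, /*add_ass=*/ true, buf, sizeof(buf));
        if (n > 0 && n < (int32_t) sizeof(buf)) {
            // expected: <|begin|>user<|content|>hi<|end|><|begin|>assistant<|content|>hello!<|end|><|begin|>assistant
            printf("%.*s\n", n, buf);
        }
    }
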

View File

@@ -36,7 +36,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3, LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_EXAONE_4, LLM_CHAT_TEMPLATE_EXAONE_4,
LLM_CHAT_TEMPLATE_EXAONE_MOE,
LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE, LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT, LLM_CHAT_TEMPLATE_GIGACHAT,
@@ -55,7 +54,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2, LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED, LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };

File diff suppressed because it is too large

View File

@@ -40,14 +40,6 @@ struct llama_context {
~llama_context(); ~llama_context();
// reserve a new backend scheduler (if needed)
// for example, when:
// - changing loras
// - changing samplers
// - changing attention type
// - etc.
void sched_reserve();
void synchronize(); void synchronize();
const llama_model & get_model() const; const llama_model & get_model() const;
@@ -78,18 +70,6 @@ struct llama_context {
float * get_embeddings_ith(int32_t i); float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id); float * get_embeddings_seq(llama_seq_id seq_id);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
float * get_sampled_logits_ith(int32_t idx);
size_t get_sampled_logits_count(int32_t idx);
float * get_sampled_probs_ith(int32_t idx);
size_t get_sampled_probs_count(int32_t idx);
const llama_token * get_sampled_candidates_ith(int32_t idx);
size_t get_sampled_candidates_count(int32_t idx);
void attach_threadpool( void attach_threadpool(
ggml_threadpool_t threadpool, ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch); ggml_threadpool_t threadpool_batch);
@@ -212,13 +192,10 @@ private:
// Make sure enough space is available for outputs. // Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved. // Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch); uint32_t output_reserve(int32_t n_outputs);
void output_reorder(); void output_reorder();
// map the output row index `i` to batch index
int64_t output_resolve_row(int32_t i) const;
// //
// graph // graph
// //
@@ -236,8 +213,6 @@ public:
ggml_cgraph * graph_reserve( ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
private: private:
llm_graph_params graph_params( llm_graph_params graph_params(
llm_graph_result * res, llm_graph_result * res,
@@ -277,31 +252,6 @@ private:
size_t embd_size = 0; // capacity (of floats) for embeddings size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr; float * embd = nullptr;
// TODO: simplify
struct sampling_info {
std::map<llama_seq_id, llama_sampler *> samplers;
float * logits = nullptr;
size_t logits_size = 0;
llama_token * sampled = nullptr;
size_t sampled_size = 0;
float * probs = nullptr;
size_t probs_size = 0;
llama_token * candidates = nullptr;
size_t candidates_size = 0;
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
std::vector<llama_token> token_ids_full_vocab;
};
sampling_info sampling;
// sequence embeddings output (map of [n_embd] vectors) // sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq; std::map<llama_seq_id, std::vector<float>> embd_seq;
@@ -322,8 +272,6 @@ private:
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
bool sched_need_reserve = true;
ggml_backend_t backend_cpu = nullptr; ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends; std::vector<ggml_backend_ptr> backends;

View File

@@ -30,12 +30,10 @@ struct llama_cparams {
bool causal_attn; bool causal_attn;
bool offload_kqv; bool offload_kqv;
bool flash_attn; bool flash_attn;
bool auto_fa;
bool no_perf; bool no_perf;
bool warmup; bool warmup;
bool op_offload; bool op_offload;
bool kv_unified; bool kv_unified;
bool pipeline_parallel;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;

View File

@@ -369,44 +369,6 @@ static void print_rule(
fprintf(file, "\n"); fprintf(file, "\n");
} }
//
// Regex utilities
//
size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
auto find_start_pos = [](const std::smatch & match) {
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
return start;
};
if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
// match against the entire input
std::smatch match;
if (std::regex_match(input, match, regex)) {
return find_start_pos(match);
}
}
// search anywhere
std::smatch match;
if (std::regex_search(input, match, regex)) {
return find_start_pos(match);
}
return std::string::npos;
}
// //
// implementation // implementation
// //
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position)); grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece; grammar.trigger_buffer += piece;
std::smatch match;
for (const auto & trigger_pattern : grammar.trigger_patterns) { for (const auto & trigger_pattern : grammar.trigger_patterns) {
auto start = trigger_pattern.find(grammar.trigger_buffer); if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
if (start != std::string::npos) {
grammar.awaiting_trigger = false; grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
// replay tokens that overlap with [start, end) // replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) { for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {

View File

@@ -130,8 +130,6 @@ struct llama_grammar_parser {
struct llama_grammar_trigger_pattern { struct llama_grammar_trigger_pattern {
std::string pattern; std::string pattern;
std::regex regex; std::regex regex;
size_t find(const std::string & input) const;
}; };
struct llama_grammar { struct llama_grammar {

View File

@@ -7,13 +7,11 @@
#include "llama-kv-cache.h" #include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h" #include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h" #include "llama-memory-hybrid.h"
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h" #include "llama-memory-recurrent.h"
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <unordered_set>
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) { if (ubatch->token) {
@@ -23,8 +21,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
} }
if (ubatch->embd) { if (ubatch->embd) {
GGML_ASSERT(n_embd == embd->ne[0]); const int64_t n_embd = embd->ne[0];
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -34,8 +31,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true; bool res = true;
res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res; return res;
} }
@@ -65,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
bool res = true; bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd; res &= pos->ne[0] == params.ubatch.n_tokens;
return res; return res;
} }
@@ -98,9 +95,11 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) pos_bucket->data; int32_t * data = (int32_t *) pos_bucket->data;
for (int j = 0; j < n_tokens; ++j) { for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) { for (int j = 0; j < n_tokens; ++j) {
data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true); for (int i = 0; i < n_tokens; ++i) {
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
}
} }
} }
} }
@@ -323,32 +322,34 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) { const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
for (int i1 = 0; i1 < n_tokens; ++i1) { for (int h = 0; h < 1; ++h) {
const llama_seq_id s1 = ubatch->seq_id[i1][0]; for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_pos p1 = ubatch->pos[i1]; const llama_seq_id s1 = ubatch->seq_id[i1][0];
const llama_pos p1 = ubatch->pos[i1];
const uint64_t idst = i1*n_kv; const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
for (int i0 = 0; i0 < n_tokens; ++i0) { for (int i0 = 0; i0 < n_tokens; ++i0) {
const llama_seq_id s0 = ubatch->seq_id[i0][0]; const llama_seq_id s0 = ubatch->seq_id[i0][0];
const llama_pos p0 = ubatch->pos[i0]; const llama_pos p0 = ubatch->pos[i0];
// mask different sequences // mask different sequences
if (s0 != s1) { if (s0 != s1) {
continue; continue;
}
// mask future tokens
if (cparams.causal_attn && p0 > p1) {
continue;
}
// apply SWA if any
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
continue;
}
data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
} }
// mask future tokens
if (cparams.causal_attn && p0 > p1) {
continue;
}
// apply SWA if any
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
continue;
}
data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
} }
} }
}; };
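
Editor's note: a standalone restatement of the rule fill_mask applies above: key position p0 is visible from query position p1 only when both belong to the same sequence, p0 is not in the future (for causal attention), and p0 is not masked by the sliding window; llama_hparams::is_masked_swa and the llama_swa_type enum are assumed from llama-hparams.h as used above:

    #include "llama-hparams.h" // assumed internal header providing llama_hparams / llama_swa_type

    static bool kq_visible(llama_seq_id s0, llama_seq_id s1,
                           llama_pos p0, llama_pos p1,
                           bool causal_attn, int n_swa, llama_swa_type swa_type) {
        if (s0 != s1) {
            return false; // different sequences never attend to each other
        }
        if (causal_attn && p0 > p1) {
            return false; // future tokens are masked under causal attention
        }
        return !llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
    }
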
@@ -407,27 +408,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
return res; return res;
} }
void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
mctx->set_input_k_idxs(self_k_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
return res;
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -473,19 +453,27 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
float * data = (float *) cross_kq_mask->data; float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) { for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_enc; ++j) { for (int i = 0; i < n_tokens; ++i) {
float f = -INFINITY; for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s]; const llama_seq_id seq_id = ubatch->seq_id[i][s];
if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) { if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
f = 0.0f; f = 0.0f;
}
} }
}
data[i*n_enc + j] = f; data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
}
}
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
} }
} }
} }
@@ -533,113 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
return res; return res;
} }
void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
const auto * attn_ctx = mctx->get_attn();
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}
bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
const auto * attn_ctx = mctx->get_attn();
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
}
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
return res;
}
void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
// set the inputs only for the active samplers in the current ubatch
std::unordered_set<llama_seq_id> active_samplers;
for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
if (ubatch->output[i]) {
llama_seq_id seq_id = ubatch->seq_id[i][0];
active_samplers.insert(seq_id);
}
}
for (auto seq_id : active_samplers) {
if (samplers.find(seq_id) == samplers.end()) {
continue;
}
auto & sampler = samplers[seq_id];
if (sampler->iface->backend_set_input) {
sampler->iface->backend_set_input(sampler);
}
}
}
bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
if (samplers.size() != params.samplers.size()) {
return false;
}
for (const auto & [seq_id, sampler] : params.samplers) {
if (samplers[seq_id] != sampler) {
return false;
}
}
return true;
}
// //
// llm_graph_result // llm_graph_result
// //
@@ -656,15 +537,10 @@ int64_t llm_graph_result::get_max_nodes() const {
} }
void llm_graph_result::reset() { void llm_graph_result::reset() {
t_inp_tokens = nullptr; t_tokens = nullptr;
t_inp_embd = nullptr;
t_logits = nullptr; t_logits = nullptr;
t_embd = nullptr; t_embd = nullptr;
t_embd_pooled = nullptr; t_embd_pooled = nullptr;
t_sampled.clear();
t_sampled_probs.clear();
t_sampled_logits.clear();
t_candidates.clear();
params = {}; params = {};
@@ -689,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
} }
} }
void llm_graph_result::set_outputs() {
if (t_logits != nullptr) {
ggml_set_output(t_logits);
}
if (t_embd != nullptr) {
ggml_set_output(t_embd);
}
if (t_embd_pooled != nullptr) {
ggml_set_output(t_embd_pooled);
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_probs) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_logits) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_candidates) {
if (t != nullptr) {
ggml_set_output(t);
}
}
}
bool llm_graph_result::can_reuse(const llm_graph_params & params) { bool llm_graph_result::can_reuse(const llm_graph_params & params) {
if (!this->params.allow_reuse(params)) { if (!this->params.allow_reuse(params)) {
if (debug > 1) { if (debug > 1) {
@@ -802,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
loras (params.loras), loras (params.loras),
mctx (params.mctx), mctx (params.mctx),
cross (params.cross), cross (params.cross),
samplers (params.samplers),
cb_func (params.cb), cb_func (params.cb),
res (params.res), res (params.res),
ctx0 (res->get_ctx()), ctx0 (res->get_ctx()),
@@ -1361,29 +1204,17 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// input embeddings with optional lora // input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd_inp = hparams.n_embd_inp(); const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_embd = hparams.n_embd;
assert(n_embd_inp >= n_embd); auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp); ggml_tensor * cur = nullptr;
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); if (ubatch.token) {
cb(inp->tokens, "inp_tokens", -1); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens); //cb(inp->tokens, "inp_tokens", -1);
res->t_inp_tokens = inp->tokens; ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
cb(inp->embd, "inp_embd", -1);
ggml_set_input(inp->embd);
// select one of the 2 inputs, based on the batch contents
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
std::array<ggml_tensor *, 2> inps;
// token embeddings path (ubatch.token != nullptr)
{
auto & cur = inps[0];
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
@@ -1404,43 +1235,22 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
cur = ggml_add(ctx0, cur, inpL_delta); cur = ggml_add(ctx0, cur, inpL_delta);
} }
} else {
if (n_embd_inp != n_embd) { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0); ggml_set_input(inp->embd);
}
}
// vector embeddings path (ubatch.embd != nullptr)
{
auto & cur = inps[1];
cur = inp->embd; cur = inp->embd;
} }
assert(ggml_are_same_shape (inps[0], inps[1]));
assert(ggml_are_same_stride(inps[0], inps[1]));
ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
if (n_embd_inp != n_embd) {
cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
}
res->t_inp_embd = cur;
// For Granite architecture // For Granite architecture
if (hparams.f_embedding_scale != 0.0f) { if (hparams.f_embedding_scale != 0.0f) {
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
} }
cb(cur, "embd", -1); cb(cur, "inp_embd", -1);
res->add_input(std::move(inp)); res->add_input(std::move(inp));
// make sure the produced embeddings are immediately materialized in the ggml graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18599
ggml_build_forward_expand(gf, cur);
return cur; return cur;
} }
@@ -1532,7 +1342,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
//} //}
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp(); const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
ggml_set_input(cur); ggml_set_input(cur);
@@ -1630,11 +1440,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, LLAMA_TENSOR_NAME_FATTN, il); cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
if (!cparams.offload_kqv) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
}
ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_add_sinks(cur, sinks);
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1844,11 +1649,9 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, ggml_tensor * sinks,
ggml_tensor * v_mla, // TODO: remove ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
GGML_ASSERT(v_mla == nullptr);
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced // by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache // expand k later to enable rope fusion which directly writes into k-v cache
@@ -1891,93 +1694,6 @@ ggml_tensor * llm_graph_context::build_attn(
return cur; return cur;
} }
static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
const auto n_kv = mctx_cur->get_n_kv();
const auto n_tokens = ubatch.n_tokens;
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
return inp;
}
llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);
const auto * mctx_cur = inp->mctx;
// store to KV cache
{
const auto & k_idxs = inp->get_k_idxs();
ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
}
const auto & kq_mask = inp->get_kq_mask();
ggml_tensor * q = q_cur;
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
if (wo_b) {
cur = ggml_add(ctx0, cur, wo_b);
}
return cur;
}
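The removed V-less attention path above never allocates a separate V cache: V is taken as a zero-offset view over the leading v_cur->ne[0] elements of each cached K row. A tiny standalone sketch of that prefix-view idea, assuming plain row-major storage (illustrative only, not the actual ggml layout):

    #include <cstdio>
    #include <vector>

    int main() {
        // each cached K row holds n_embd_k values; "V" reuses the first n_embd_v of them
        const int n_kv = 3, n_embd_k = 4, n_embd_v = 2;
        std::vector<float> k(n_kv * n_embd_k);
        for (int i = 0; i < n_kv * n_embd_k; ++i) k[i] = (float) i;

        for (int row = 0; row < n_kv; ++row) {
            const float * v_row = k.data() + row * n_embd_k;  // a view, no copy
            std::printf("v[%d] = %.0f .. %.0f\n", row, v_row[0], v_row[n_embd_v - 1]);
        }
        return 0;
    }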
ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp, llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo, ggml_tensor * wo,
@@ -2118,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask); ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
} }
{ {
@@ -2134,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa); ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
} }
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp)); return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2273,62 +1985,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
} }
llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
// build iswa attention input
const auto * attn_ctx = mctx_cur->get_attn();
auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
{
const auto n_kv = attn_ctx->get_base()->get_n_kv();
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp_attn->self_kq_mask);
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
}
{
const auto n_kv = attn_ctx->get_swa()->get_n_kv();
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp_attn->self_kq_mask_swa);
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
}
auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
}
void llm_graph_context::build_dense_out( void llm_graph_context::build_dense_out(
ggml_tensor * dense_2, ggml_tensor * dense_2,
ggml_tensor * dense_3) const { ggml_tensor * dense_3) const {
if (!cparams.embeddings || !(dense_2 || dense_3)) { if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
return; return;
} }
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd; ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd"); GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
if (dense_2) { cur = ggml_mul_mat(ctx0, dense_2, cur);
cur = ggml_mul_mat(ctx0, dense_2, cur); cur = ggml_mul_mat(ctx0, dense_3, cur);
}
if (dense_3) {
cur = ggml_mul_mat(ctx0, dense_3, cur);
}
cb(cur, "result_embd_pooled", -1); cb(cur, "result_embd_pooled", -1);
res->t_embd_pooled = cur; res->t_embd_pooled = cur;
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
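build_dense_out applies the two learned output projections to the pooled embedding; the removed variant applied whichever of dense_2/dense_3 was present, while the reinstated code skips the projection entirely unless both are set. A standalone sketch of chaining optional projections over a pooled vector, with made-up shapes and names:

    #include <cstdio>
    #include <vector>

    using Mat = std::vector<std::vector<float>>;  // row-major [rows][cols]

    // y = M * x, or x unchanged when the projection is absent
    static std::vector<float> project(const Mat * M, const std::vector<float> & x) {
        if (M == nullptr) return x;
        std::vector<float> y(M->size(), 0.0f);
        for (size_t r = 0; r < M->size(); ++r)
            for (size_t c = 0; c < x.size(); ++c)
                y[r] += (*M)[r][c] * x[c];
        return y;
    }

    int main() {
        const std::vector<float> pooled = {1.0f, 2.0f};        // pooled embedding
        const Mat dense_2 = {{1.f, 0.f}, {0.f, 1.f}, {1.f, 1.f}};  // projects 2 -> 3
        const Mat dense_3 = {{1.f, 1.f, 1.f}};                     // projects 3 -> 1

        const auto out = project(&dense_3, project(&dense_2, pooled));
        std::printf("out[0] = %g\n", out[0]);                  // 1 + 2 + 3 = 6
        return 0;
    }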
@@ -2419,87 +2086,6 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
void llm_graph_context::build_sampling() const {
if (samplers.empty() || !res->t_logits) {
return;
}
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
res->add_input(std::move(inp_sampling));
std::map<llama_seq_id, int32_t> seq_to_logit_row;
int32_t logit_row_idx = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
if (ubatch.output[i]) {
llama_seq_id seq_id = ubatch.seq_id[i][0];
seq_to_logit_row[seq_id] = logit_row_idx;
logit_row_idx++;
}
}
// res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
// add a dummy row of logits
// this trick makes the graph static, regardless of which samplers are activated
// this is important in order to minimize graph reallocations
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
for (const auto & [seq_id, sampler] : samplers) {
const auto it = seq_to_logit_row.find(seq_id);
// inactive samplers always work on the first row
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
struct llama_sampler_data data = {
/*.logits =*/ logits_seq,
/*.probs =*/ nullptr,
/*.sampled =*/ nullptr,
/*.candidates =*/ nullptr,
};
assert(sampler->iface->backend_apply);
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
if (data.sampled != nullptr) {
res->t_sampled[seq_id] = data.sampled;
ggml_build_forward_expand(gf, data.sampled);
}
if (data.probs != nullptr) {
res->t_sampled_probs[seq_id] = data.probs;
ggml_build_forward_expand(gf, data.probs);
}
if (data.logits != nullptr) {
res->t_sampled_logits[seq_id] = data.logits;
ggml_build_forward_expand(gf, data.logits);
}
if (data.candidates != nullptr) {
res->t_candidates[seq_id] = data.candidates;
ggml_build_forward_expand(gf, data.candidates);
}
}
// TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
/*
for (const auto & [seq_id, sampler] : samplers) {
if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
ggml_tensor * selected_token = it->second;
if (selected_token != nullptr) {
llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
}
}
}
*/
}
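The removed build_sampling above keeps the sampling graph static across decodes: the logits get one extra padded row, every sequence that produced an output is mapped to its logit row, and samplers whose sequence has no output row fall back to a fixed row so the set of views never changes shape. A small standalone sketch of that seq-id-to-row mapping with a fallback row (plain C++, not the ggml plumbing):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // per-token output flags and sequence ids for a tiny ubatch
        const std::vector<int>     output = {1, 0, 1};
        const std::vector<int32_t> seq_id = {0, 1, 2};

        // map each sequence that produced an output to its logit row
        std::map<int32_t, int32_t> seq_to_logit_row;
        int32_t logit_row_idx = 0;
        for (size_t i = 0; i < output.size(); ++i) {
            if (output[i]) {
                seq_to_logit_row[seq_id[i]] = logit_row_idx++;
            }
        }

        // inactive samplers read a fixed row, so the graph layout is independent
        // of which sequences happen to be active in this batch
        for (int32_t s = 0; s < 3; ++s) {
            const auto it = seq_to_logit_row.find(s);
            const int32_t row = it != seq_to_logit_row.end() ? it->second : 0;
            std::printf("seq %d -> logit row %d\n", s, row);
        }
        return 0;
    }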
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value // TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128; const int64_t max_distance = 128;

View File

@@ -10,7 +10,6 @@
#include <memory> #include <memory>
#include <set> #include <set>
#include <functional> #include <functional>
#include <map>
struct ggml_cgraph; struct ggml_cgraph;
struct ggml_context; struct ggml_context;
@@ -24,7 +23,6 @@ class llama_kv_cache_context;
class llama_kv_cache_iswa_context; class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context; class llama_memory_recurrent_context;
class llama_memory_hybrid_context; class llama_memory_hybrid_context;
class llama_memory_hybrid_iswa_context;
// certain models (typically multi-modal) can produce different types of graphs // certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type { enum llm_graph_type {
@@ -106,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i { class llm_graph_input_embd : public llm_graph_input_i {
public: public:
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} llm_graph_input_embd() = default;
virtual ~llm_graph_input_embd() = default; virtual ~llm_graph_input_embd() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@@ -115,8 +113,6 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
const int64_t n_embd = 0;
}; };
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -317,39 +313,6 @@ public:
const llama_kv_cache_context * mctx; const llama_kv_cache_context * mctx;
}; };
// V-less input for the KV cache
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
class llm_graph_input_attn_k : public llm_graph_input_i {
public:
llm_graph_input_attn_k(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
~llm_graph_input_attn_k() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
const llama_hparams hparams;
const llama_cparams cparams;
const llama_kv_cache_context * mctx;
};
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public: public:
llm_graph_input_attn_kv_iswa( llm_graph_input_attn_kv_iswa(
@@ -433,46 +396,6 @@ public:
const llama_memory_hybrid_context * mctx; const llama_memory_hybrid_context * mctx;
}; };
class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid_iswa(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_iswa_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid_iswa() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_cparams cparams;
const llama_memory_hybrid_iswa_context * mctx;
};
class llm_graph_input_sampling : public llm_graph_input_i {
public:
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
samplers(std::move(samplers)) { }
virtual ~llm_graph_input_sampling() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::map<llama_seq_id, llama_sampler *> samplers;
};
// //
// llm_graph_result // llm_graph_result
// //
@@ -506,23 +429,6 @@ struct llm_graph_params {
const llama_memory_context_i * mctx; const llama_memory_context_i * mctx;
const llama_cross * cross; const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
static bool samplers_equal(
const std::map<llama_seq_id, llama_sampler *> & lhs,
const std::map<llama_seq_id, llama_sampler *> & rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto & [seq_id, sampler] : lhs) {
auto it = rhs.find(seq_id);
if (it == rhs.end() || it->second != sampler) {
return false;
}
}
return true;
}
uint32_t n_outputs; uint32_t n_outputs;
llm_graph_cb cb; llm_graph_cb cb;
@@ -562,36 +468,15 @@ struct llm_graph_params {
return false; return false;
} }
if (n_outputs != other.n_outputs) {
return false;
}
if (!samplers_equal(samplers, other.samplers)) {
return false;
}
if (samplers.size() > 0) {
if (!ubatch.data || !other.ubatch.data) {
return false;
}
// check that the outputs are the same for all samplers
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (ubatch.output[i] != other.ubatch.output[i] ||
ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
return false;
}
}
}
return return
cparams.embeddings == other.cparams.embeddings && cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn && cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch && arch == other.arch &&
gtype == other.gtype && gtype == other.gtype &&
cvec == other.cvec && cvec == other.cvec &&
loras == other.loras && loras == other.loras &&
cross == other.cross; cross == other.cross &&
n_outputs == other.n_outputs;
} }
}; };
@@ -601,7 +486,7 @@ public:
virtual ~llm_graph_result() = default; virtual ~llm_graph_result() = default;
ggml_tensor * get_inp_tokens() const { return t_inp_tokens; } ggml_tensor * get_tokens() const { return t_tokens; }
ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -614,7 +499,6 @@ public:
void reset(); void reset();
void set_inputs(const llama_ubatch * ubatch); void set_inputs(const llama_ubatch * ubatch);
void set_outputs();
// try to update the existing graph result using the new graph parameters in order to reuse it // try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -628,17 +512,11 @@ public:
void set_params(const llm_graph_params & params); void set_params(const llm_graph_params & params);
// important graph nodes // important graph nodes
ggml_tensor * t_inp_tokens = nullptr; ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
ggml_tensor * t_logits = nullptr; ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_embd_pooled = nullptr;
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;
std::map<llama_seq_id, ggml_tensor*> t_sampled;
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
std::vector<llm_graph_input_ptr> inputs; std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute; ggml_context_ptr ctx_compute;
@@ -714,8 +592,6 @@ struct llm_graph_context {
const llama_memory_context_i * mctx; const llama_memory_context_i * mctx;
const llama_cross * cross; const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
const llm_graph_cb & cb_func; const llm_graph_cb & cb_func;
llm_graph_result * res; llm_graph_result * res;
@@ -866,21 +742,6 @@ struct llm_graph_context {
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q] ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
float kq_scale,
int il) const;
llm_graph_input_attn_k * build_attn_inp_k() const;
ggml_tensor * build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@@ -961,8 +822,6 @@ struct llm_graph_context {
llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
// //
// pooling // pooling
// //
@@ -973,12 +832,6 @@ struct llm_graph_context {
ggml_tensor * cls_out, ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const; ggml_tensor * cls_out_b) const;
//
// sampling (backend sampling)
//
void build_sampling() const;
// //
// dense (out) // dense (out)
// //

View File

@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp; return n_embd_inp;
} }
uint32_t llama_hparams::n_embd_out() const {
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il); const uint32_t n_head_kv = this->n_head_kv(il);
@@ -183,21 +179,6 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
bool llama_hparams::is_mla() const {
assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
(n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
}
bool llama_hparams::has_kv(uint32_t il) const { bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) { if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) { if (il < (uint32_t) n_layer_kv_from_start) {
@@ -223,6 +204,42 @@ uint32_t llama_hparams::n_layer_kv() const {
return res; return res;
} }
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}
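As a sanity check on the three SWA variants above, here is a hedged standalone rendering of the same predicate exercised with a few concrete positions (n_swa = 4); it is kept independent of the llama headers and the enum names are illustrative:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    enum swa_type { SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

    // mirrors the masking rules of llama_hparams::is_masked_swa
    static bool masked(uint32_t n_swa, swa_type type, int32_t p0, int32_t p1) {
        switch (type) {
            case SWA_STANDARD:  return p1 - p0 >= (int32_t) n_swa;
            case SWA_CHUNKED:   return p0 < (p1 / (int32_t) n_swa) * (int32_t) n_swa;
            case SWA_SYMMETRIC: return std::abs(p1 - p0) > (int32_t) n_swa / 2;
        }
        return false;
    }

    int main() {
        // standard window of 4: position 3 still sees position 0, position 4 does not
        assert(!masked(4, SWA_STANDARD, 0, 3));
        assert( masked(4, SWA_STANDARD, 0, 4));
        // chunked: a token at position 5 belongs to chunk [4, 8), so positions < 4 are masked
        assert( masked(4, SWA_CHUNKED, 3, 5));
        assert(!masked(4, SWA_CHUNKED, 4, 5));
        // symmetric: half-window of 2 in both directions
        assert(!masked(4, SWA_SYMMETRIC, 6, 8));
        assert( masked(4, SWA_SYMMETRIC, 5, 8));
        return 0;
    }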
bool llama_hparams::use_mrope() const { bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0; return rope_sections[0] > 0 && rope_sections[1] > 0;
} }

View File

@@ -3,7 +3,6 @@
#include "llama.h" #include "llama.h"
#include <array> #include <array>
#include <cassert>
// bump if necessary // bump if necessary
#define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_LAYERS 512
@@ -53,8 +52,8 @@ struct llama_hparams {
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0; uint32_t n_embd_head_k_mla = 0;
uint32_t n_embd_head_v_mla_impl = 0; uint32_t n_embd_head_v_mla = 0;
// for WavTokenizer // for WavTokenizer
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
@@ -108,9 +107,9 @@ struct llama_hparams {
float rope_attn_factor = 1.0f; float rope_attn_factor = 1.0f;
float rope_freq_base_train; float rope_freq_base_train;
float rope_freq_base_train_swa = 10000.0f; float rope_freq_base_train_swa;
float rope_freq_scale_train; float rope_freq_scale_train;
float rope_freq_scale_train_swa = 1.0f; float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn; uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f; float rope_yarn_log_mul = 0.0f;
@@ -126,11 +125,10 @@ struct llama_hparams {
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA) // the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0; uint32_t n_swa = 0;
// if swa_layers[il] == 1, then layer il is SWA // if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA) // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense // by default, all layers are dense
// note: using uint32_t type for compatibility reason std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models // for State Space Models
uint32_t ssm_d_conv = 0; uint32_t ssm_d_conv = 0;
@@ -165,9 +163,6 @@ struct llama_hparams {
// for Classifiers // for Classifiers
uint32_t n_cls_out = 1; uint32_t n_cls_out = 1;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out_impl = 0;
// llama4 smallthinker // llama4 smallthinker
uint32_t n_moe_layer_step = 0; uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4; uint32_t n_no_rope_layer_step = 4;
@@ -240,9 +235,6 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings // dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const; uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
// dimension of key embeddings across all k-v heads // dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const; uint32_t n_embd_k_gqa(uint32_t il = 0) const;
@@ -274,57 +266,15 @@ struct llama_hparams {
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
// note: currently only support if either all or none of the layers are MLA
bool is_mla() const;
uint32_t n_embd_head_k_mla() const;
uint32_t n_embd_head_v_mla() const;
bool has_kv(uint32_t il) const; bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true // number of layers for which has_kv() returns true
uint32_t n_layer_kv() const; uint32_t n_layer_kv() const;
// note that this function uses different SWA parameters from those in the hparams // note that this function uses different SWA parameters from those in the hparams
// note: inlined on purpose for performance reasons
// TODO: think of a better place for this function // TODO: think of a better place for this function
// TODO: pack the SWA params in a struct? // TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) { static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}
bool use_mrope() const; bool use_mrope() const;
}; };

View File

@@ -97,8 +97,6 @@ llama_kv_cache::llama_kv_cache(
__func__, hparams.n_embd_v_gqa_max()); __func__, hparams.n_embd_v_gqa_max());
} }
const bool is_mla = hparams.is_mla();
for (uint32_t il = 0; il < hparams.n_layer; il++) { for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (!hparams.has_kv(il)) { if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
@@ -132,21 +130,18 @@ llama_kv_cache::llama_kv_cache(
throw std::runtime_error("failed to create ggml context for kv cache"); throw std::runtime_error("failed to create ggml context for kv cache");
} }
const bool has_k = true; ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
const bool has_v = !is_mla; ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr; ggml_format_name(k, "cache_k_l%d", il);
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr; ggml_format_name(v, "cache_v_l%d", il);
has_k && ggml_format_name(k, "cache_k_l%d", il);
has_v && ggml_format_name(v, "cache_v_l%d", il);
std::vector<ggml_tensor *> k_stream; std::vector<ggml_tensor *> k_stream;
std::vector<ggml_tensor *> v_stream; std::vector<ggml_tensor *> v_stream;
for (uint32_t s = 0; s < n_stream; ++s) { for (uint32_t s = 0; s < n_stream; ++s) {
k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr); k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr); v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
} }
map_layer_ids[il] = layers.size(); map_layer_ids[il] = layers.size();
@@ -652,10 +647,7 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co
const auto & layer = layers[il]; const auto & layer = layers[il];
ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]); ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
if (layer.v_stream[ssrc]) {
ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
}
} }
} }
} }
@@ -860,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
const llama_seq_id seq_id_cell = cells.seq_get(idx); const llama_seq_id seq_id_cell = cells.seq_get(idx);
// SWA mask // SWA mask
if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
can_use = true; can_use = true;
} }
} }
@@ -1245,197 +1237,6 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
} }
} }
struct args_set_input_kq_mask {
const llama_hparams & hparams;
const llama_ubatch * ubatch;
const std::vector<llama_kv_cells> & v_cells;
const std::vector<uint32_t> & seq_to_stream;
uint32_t n_swa;
llama_swa_type swa_type;
int64_t n_kv;
int64_t n_stream;
int64_t n_tps;
};
template<bool causal, bool swa, bool is_2d, bool alibi>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
//const auto & hparams = args.hparams;
const auto & ubatch = args.ubatch;
const auto & v_cells = args.v_cells;
const auto & seq_to_stream = args.seq_to_stream;
const uint32_t n_swa = args.n_swa;
const llama_swa_type swa_type = args.swa_type;
const int64_t n_kv = args.n_kv;
const int64_t n_stream = args.n_stream;
const int64_t n_tps = args.n_tps;
// the min position in the batch for each sequence
llama_pos seq_pos_min[LLAMA_MAX_SEQ];
std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
const llama_seq_id seq_id = ubatch->seq_id[i][0];
seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
}
for (uint32_t s = 0; s < n_stream; ++s) {
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
for (uint32_t ii = 0; ii < n_tps; ++ii) {
const uint32_t i = s*n_tps + ii;
const llama_seq_id seq_id = ubatch->seq_id[i][0];
const auto & cells = v_cells.at(seq_to_stream[seq_id]);
llama_pos p0 = -1;
const llama_pos p1 = ubatch->pos[i];
// for M-RoPE
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*i;
// for tokens of the same sequence, the mask is mostly the same, so we can reuse it
// the only cells that could change are the ones that are with similar positions as the
// ones in the batch (i.e. due to causal masking, SWA, etc.)
// keep track of those cells and shortcut the loop to save time
// note: this optimization is not compatible with Alibi position encoding
// ref: https://github.com/ggml-org/llama.cpp/pull/18842
bool prev = false;
auto & idxs = seq_idxs[seq_id];
if (!alibi) {
if (seq_srct.find(seq_id) != seq_srct.end()) {
const uint32_t srct = seq_srct[seq_id];
const uint64_t idst_prev = n_kv*srct;
std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
prev = true;
} else {
idxs.clear();
idxs.reserve(ubatch->n_tokens + n_swa + 32);
seq_srct[seq_id] = i;
}
}
for (uint32_t jj = 0; jj < n_kv; ++jj) {
uint32_t j = jj;
// we have an existing mask for this sequence -> update just seq_idxs
if (!alibi) {
if (prev) {
if (jj >= idxs.size()) {
break;
}
j = idxs[jj];
}
}
if (cells.is_empty(j)) {
goto skip;
}
// mask the token if not the same sequence
if (!cells.seq_has(j, seq_id)) {
goto skip;
}
p0 = cells.pos_get(j);
if (!alibi) {
if (!prev) {
// record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
idxs.push_back(j);
}
}
}
if (causal) {
// mask future tokens
if (p0 > p1) {
goto skip;
}
// M-RoPE causal mask
if (is_2d) {
if (p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
goto skip;
}
}
}
}
// apply SWA if any
if (swa) {
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
goto skip;
}
}
if (alibi) {
data[idst + j] = -std::abs(p0 - p1);
} else {
data[idst + j] = 0.0f;
}
continue;
skip:
data[idst + j] = -INFINITY;
}
}
}
}
template<bool causal, bool swa, bool is_2d>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool alibi = args.hparams.use_alibi;
if (alibi) {
set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
} else {
set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
}
}
template<bool causal, bool swa>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool is_2d = args.ubatch->is_pos_2d();
if (is_2d) {
set_input_kq_mask_impl<causal, swa, true> (args, data);
} else {
set_input_kq_mask_impl<causal, swa, false>(args, data);
}
}
template<bool causal>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
if (swa) {
set_input_kq_mask_impl<causal, true> (args, data);
} else {
set_input_kq_mask_impl<causal, false>(args, data);
}
}
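The removed mask-filling code above folds its runtime flags (causal, SWA, 2-D positions, alibi) into template parameters, so the inner per-cell loop compiles without branches on those flags. A minimal standalone sketch of that peel-one-flag-per-level dispatch pattern, with illustrative names and only two flags:

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    template <bool causal, bool alibi>
    static void fill_mask_impl(std::vector<float> & mask, int n) {
        for (int p1 = 0; p1 < n; ++p1) {          // query position
            for (int p0 = 0; p0 < n; ++p0) {      // cached position
                float v = 0.0f;
                if (causal && p0 > p1) {
                    v = -1e30f;                   // masked-out future cell
                } else if (alibi) {
                    v = -(float) std::abs(p1 - p0);  // distance-based bias
                }
                mask[p1 * n + p0] = v;
            }
        }
    }

    // peel one runtime flag per level, ending in a fully specialized loop
    template <bool causal>
    static void fill_mask_impl(std::vector<float> & mask, int n, bool alibi) {
        if (alibi) { fill_mask_impl<causal, true >(mask, n); }
        else       { fill_mask_impl<causal, false>(mask, n); }
    }

    static void fill_mask(std::vector<float> & mask, int n, bool causal, bool alibi) {
        if (causal) { fill_mask_impl<true >(mask, n, alibi); }
        else        { fill_mask_impl<false>(mask, n, alibi); }
    }

    int main() {
        std::vector<float> mask(4 * 4);
        fill_mask(mask, 4, /*causal=*/true, /*alibi=*/false);
        std::printf("mask[q=0, kv=3] = %g\n", mask[0 * 4 + 3]);  // masked future cell
        return 0;
    }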
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
const uint32_t n_tokens = ubatch->n_tokens; const uint32_t n_tokens = ubatch->n_tokens;
@@ -1450,29 +1251,74 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
// n_tps == n_tokens_per_stream // n_tps == n_tokens_per_stream
const int64_t n_tps = n_tokens/n_stream; const int64_t n_tps = n_tokens/n_stream;
//const int64_t t_start = ggml_time_us(); std::fill(data, data + ggml_nelements(dst), -INFINITY);
const args_set_input_kq_mask args = { // Use only the previous KV cells of the correct sequence for each token of the ubatch.
/*.hparams =*/ hparams, // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
/*.ubatch =*/ ubatch, // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
/*.v_cells =*/ v_cells, // Causal mask:
/*.seq_to_stream =*/ seq_to_stream, // xxx-------
/*.n_swa =*/ n_swa, // xxxx------
/*.swa_type =*/ swa_type, // xxxxx-----
/*.n_kv =*/ n_kv, // Non-causal mask:
/*.n_stream =*/ n_stream, // xxxxx-----
/*.n_tps =*/ n_tps, // xxxxx-----
}; // xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
// TODO: optimize this section
for (uint32_t h = 0; h < 1; ++h) {
for (uint32_t s = 0; s < n_stream; ++s) {
for (uint32_t ii = 0; ii < n_tps; ++ii) {
const uint32_t i = s*n_tps + ii;
if (causal_attn) { const llama_seq_id seq_id = ubatch->seq_id[i][0];
set_input_kq_mask_impl<true> (args, data);
} else { const auto & cells = v_cells[seq_to_stream[seq_id]];
set_input_kq_mask_impl<false>(args, data);
const llama_pos p1 = ubatch->pos[i];
// for M-RoPE
const bool is_2d = ubatch->is_pos_2d();
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
for (uint32_t j = 0; j < n_kv; ++j) {
if (cells.is_empty(j)) {
continue;
}
// mask the token if not the same sequence
if (!cells.seq_has(j, seq_id)) {
continue;
}
const llama_pos p0 = cells.pos_get(j);
// mask future tokens
if (causal_attn && p0 > p1) {
continue;
}
// M-RoPE causal mask
if (causal_attn && is_2d && p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
continue;
}
}
// apply SWA if any
if (is_masked_swa(p0, p1)) {
continue;
}
data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
}
}
}
} }
//const int64_t t_end = ggml_time_us();
//LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
} }
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
@@ -1524,7 +1370,7 @@ size_t llama_kv_cache::size_v_bytes() const {
size_t size_v_bytes = 0; size_t size_v_bytes = 0;
for (const auto & layer : layers) { for (const auto & layer : layers) {
size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0; size_v_bytes += ggml_nbytes(layer.v);
} }
return size_v_bytes; return size_v_bytes;
@@ -1602,10 +1448,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v; //const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto & n_rot = hparams.n_rot;
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
auto inp = std::make_unique<llm_graph_input_k_shift>(this); auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
@@ -1626,10 +1468,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_tensor * k = ggml_tensor * k =
ggml_view_3d(ctx, layer.k, ggml_view_3d(ctx, layer.k,
n_rot, n_head_kv, get_size()*n_stream, n_embd_head_k, n_head_kv, get_size()*n_stream,
ggml_row_size(layer.k->type, n_embd_head_k), ggml_row_size(layer.k->type, n_embd_head_k),
ggml_row_size(layer.k->type, n_embd_k_gqa), ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope)); 0);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
@@ -1641,6 +1483,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
return gf; return gf;
} }
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
}
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags); GGML_UNUSED(flags);
@@ -1806,9 +1652,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm]; auto * v = layer.v_stream[cr.strm];
if (!v) {
continue;
}
// Write value type // Write value type
const int32_t v_type_i = (int32_t) v->type; const int32_t v_type_i = (int32_t) v->type;
@@ -1835,9 +1678,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm]; auto * v = layer.v_stream[cr.strm];
if (!v) {
continue;
}
// Write value type // Write value type
const int32_t v_type_i = (int32_t) v->type; const int32_t v_type_i = (int32_t) v->type;
@@ -2041,9 +1881,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm]; auto * v = layer.v_stream[strm];
if (!v) {
continue;
}
// Read type of value // Read type of value
int32_t v_type_i_ref; int32_t v_type_i_ref;
@@ -2085,9 +1922,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm]; auto * v = layer.v_stream[strm];
if (!v) {
continue;
}
// Read type of value // Read type of value
int32_t v_type_i_ref; int32_t v_type_i_ref;

View File

@@ -257,6 +257,8 @@ private:
size_t size_k_bytes() const; size_t size_k_bytes() const;
size_t size_v_bytes() const; size_t size_v_bytes() const;
bool is_masked_swa(llama_pos p0, llama_pos p1) const;
ggml_tensor * build_rope_shift( ggml_tensor * build_rope_shift(
const llama_cparams & cparams, const llama_cparams & cparams,
ggml_context * ctx, ggml_context * ctx,
@@ -303,7 +305,7 @@ public:
bool do_shift, bool do_shift,
stream_copy_info sc_info); stream_copy_info sc_info);
// used to create a batch processing context from a batch // used to create a batch processing context from a batch
llama_kv_cache_context( llama_kv_cache_context(
llama_kv_cache * kv, llama_kv_cache * kv,
slot_info_vec_t sinfos, slot_info_vec_t sinfos,

View File

@@ -1,275 +0,0 @@
#include "llama-memory-hybrid-iswa.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-context.h"
//
// llama_memory_hybrid_iswa
//
llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool swa_full,
uint32_t kv_size,
uint32_t n_ubatch,
uint32_t n_pad,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache_iswa(
model,
type_k,
type_v,
v_trans,
offload,
swa_full,
unified,
kv_size,
n_seq_max,
n_ubatch,
n_pad,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)),
mem_recr(new llama_memory_recurrent(
model,
type_r,
type_s,
offload,
rs_size,
n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
// follow the recurrent pattern for creating the ubatch splits
std::vector<llama_ubatch> ubatches;
while (true) {
llama_ubatch ubatch;
if (embd_all) {
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
// TODO: non-sequential equal split can be done if using unified KV cache
// for simplicity, we always use sequential equal split for now
ubatch = balloc.split_equal(n_ubatch, true);
}
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
if (balloc.get_n_used() < balloc.get_n_tokens()) {
// failed to find a suitable split
break;
}
// prepare the recurrent batches first
if (!mem_recr->prepare(ubatches)) {
// TODO: will the recurrent cache be in an undefined context at this point?
LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
// prepare the attention cache (iswa version returns both base and swa slot infos)
auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
if (sinfos_base.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
if (sinfos_swa.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
return std::make_unique<llama_memory_hybrid_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while(false);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
return std::make_unique<llama_memory_hybrid_iswa_context>(this);
}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
}
bool llama_memory_hybrid_iswa::get_can_shift() const {
// Shifting is trivially supported for recurrent
return mem_attn->get_can_shift();
}
void llama_memory_hybrid_iswa::clear(bool data) {
mem_attn->clear(data);
mem_recr->clear(data);
}
bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
// Try removing from the recurrent cache first since it may fail. If it does
// fail, the cache will not have been mutated.
if (!mem_recr->seq_rm(seq_id, p0, p1)) {
return false;
}
return mem_attn->seq_rm(seq_id, p0, p1);
}
void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
mem_attn->seq_keep(seq_id);
mem_recr->seq_keep(seq_id);
}
void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
mem_attn->seq_add(seq_id, p0, p1, shift);
mem_recr->seq_add(seq_id, p0, p1, shift);
}
void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
mem_attn->seq_div(seq_id, p0, p1, d);
mem_recr->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the min of the total cache is the max of the two caches' min values
return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
}
llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
// the max of the total cache is the min of the two caches' max values
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
mem_attn->state_write(io, seq_id, flags);
mem_recr->state_write(io, seq_id, flags);
}
void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
mem_attn->state_read(io, seq_id, flags);
mem_recr->state_read(io, seq_id, flags);
}
llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
return mem_attn.get();
}
llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
return mem_recr.get();
}
//
// llama_memory_hybrid_iswa_context
//
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
ctx_attn(mem->get_mem_attn()->init_full()),
ctx_recr(mem->get_mem_recr()->init_full()),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
llama_context * lctx,
bool optimize) :
ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
bool llama_memory_hybrid_iswa_context::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
ctx_attn->next();
ctx_recr->next();
if (++i_next >= ubatches.size()) {
return false;
}
return true;
}
bool llama_memory_hybrid_iswa_context::apply() {
assert(!llama_memory_status_is_fail(status));
bool res = true;
res = res & ctx_attn->apply();
res = res & ctx_recr->apply();
return res;
}
llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
return status;
}
const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
}
const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
}
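The removed hybrid iSWA memory above simply composes an attention (iSWA) cache with a recurrent cache and reports the intersection of their valid position ranges: the combined minimum is the max of the two minima, the combined maximum is the min of the two maxima. A tiny standalone illustration of that combination rule with made-up numbers:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // positions still held for one sequence in each sub-cache (illustrative)
        const int attn_min = 10, attn_max = 80;   // attention (iSWA) cache
        const int recr_min =  0, recr_max = 64;   // recurrent cache

        // only positions present in *both* caches are usable by the hybrid memory
        const int min_pos = std::max(attn_min, recr_min);
        const int max_pos = std::min(attn_max, recr_max);
        std::printf("usable positions: [%d, %d]\n", min_pos, max_pos);
        return 0;
    }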

View File

@@ -1,140 +0,0 @@
#pragma once
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"
#include <memory>
#include <vector>
//
// llama_memory_hybrid_iswa
//
// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
// support models where each layer may be either attention-based (with SWA support) or recurrent
class llama_memory_hybrid_iswa : public llama_memory_i {
public:
llama_memory_hybrid_iswa(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool swa_full,
uint32_t kv_size,
uint32_t n_ubatch,
uint32_t n_pad,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid_iswa() = default;
//
// llama_memory_i
//
llama_memory_context_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_context_ptr init_full() override;
llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_memory_hybrid_iswa specific API
//
llama_kv_cache_iswa * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const;
private:
const llama_hparams & hparams;
const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr;
};
class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// init failure
explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
// init full
explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
// init update
explicit llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
llama_context * lctx,
bool optimize);
// init success
llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches);
~llama_memory_hybrid_iswa_context() = default;
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_memory_hybrid_iswa_context
//
const llama_kv_cache_iswa_context * get_attn() const;
const llama_memory_recurrent_context * get_recr() const;
private:
// the index of the next ubatch to process
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
const llama_memory_context_ptr ctx_attn;
const llama_memory_context_ptr ctx_recr;
const llama_memory_status status;
};

View File

@@ -13,10 +13,9 @@
#ifdef __has_include #ifdef __has_include
#if __has_include(<unistd.h>) #if __has_include(<unistd.h>)
#include <unistd.h> #include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES) #if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h> #include <sys/mman.h>
#include <fcntl.h>
#endif #endif
#if defined(_POSIX_MEMLOCK_RANGE) #if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h> #include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
return ret; return ret;
} }
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode); fp = ggml_fopen(fname, mode);
if (fp == NULL) { if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
} }
} }
void read_raw(void * ptr, size_t len) { void read_raw(void * ptr, size_t len) const {
size_t bytes_read = 0; size_t bytes_read = 0;
while (bytes_read < len) { while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024); size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
} }
} }
uint32_t read_u32() { uint32_t read_u32() const {
uint32_t val; uint32_t val;
read_raw(&val, sizeof(val)); read_raw(&val, sizeof(val));
return val; return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
write_raw(&val, sizeof(val)); write_raw(&val, sizeof(val));
} }
bool has_direct_io() const {
return true;
}
~impl() { ~impl() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
} }
} }
#else #else
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) { impl(const char * fname, const char * mode) {
#ifdef __linux__ fp = ggml_fopen(fname, mode);
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
if (init_fd()) {
return;
}
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
init_fp(mode);
}
#ifdef __linux__
bool init_fd() {
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
if (fd != -1) {
struct stat file_stats{};
fstat(fd, &file_stats);
size = file_stats.st_size;
alignment = file_stats.st_blksize;
off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return true;
}
return false;
}
#endif
void init_fp(const char * mode) {
fp = ggml_fopen(fname.c_str(), mode);
if (fp == NULL) { if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno))); throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
} }
seek(0, SEEK_END); seek(0, SEEK_END);
size = tell(); size = tell();
@@ -210,122 +170,46 @@ struct llama_file::impl {
} }
size_t tell() const { size_t tell() const {
if (fd == -1) { // TODO: this ifdef is never true?
long ret = std::ftell(fp); #ifdef _WIN32
if (ret == -1) { __int64 ret = _ftelli64(fp);
throw std::runtime_error(format("ftell error: %s", strerror(errno))); #else
} long ret = std::ftell(fp);
#endif
return (size_t) ret; if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
} }
off_t pos = lseek(fd, 0, SEEK_CUR); return (size_t) ret;
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
} }
void seek(size_t offset, int whence) const { void seek(size_t offset, int whence) const {
off_t ret = 0; // TODO: this ifdef is never true?
if (fd == -1) { #ifdef _WIN32
ret = std::fseek(fp, (long) offset, whence); int ret = _fseeki64(fp, (__int64) offset, whence);
} else { #else
ret = lseek(fd, offset, whence); int ret = std::fseek(fp, (long) offset, whence);
} #endif
if (ret == -1) { if (ret != 0) {
throw std::runtime_error(format("seek error: %s", strerror(errno))); throw std::runtime_error(format("seek error: %s", strerror(errno)));
} }
} }
void read_raw_unsafe(void * ptr, size_t len) { void read_raw(void * ptr, size_t len) const {
if (len == 0) { if (len == 0) {
return; return;
} }
errno = 0; errno = 0;
if (fd == -1) { std::size_t ret = std::fread(ptr, len, 1, fp);
const size_t curr_off = tell(); if (ferror(fp)) {
const size_t to_read = std::min(len, size - curr_off); throw std::runtime_error(format("read error: %s", strerror(errno)));
}
std::size_t ret = std::fread(ptr, to_read, 1, fp); if (ret != 1) {
if (ferror(fp)) { throw std::runtime_error("unexpectedly reached end of file");
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (to_read > 0 && ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
size_t bytes_read = 0;
while (bytes_read < len) {
const size_t to_read = len - bytes_read;
ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
// Fallback to std::fread in case the DMA controller cannot access the buffer
if (errno == EFAULT || errno == EINVAL) {
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
auto curr_off = tell();
close(fd);
fd = -1;
alignment = 1;
init_fp("rb");
seek(curr_off, SEEK_SET);
read_raw_unsafe(ptr, len);
return;
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
// EOF: allow if this read was only pulling alignment padding past file end
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos != -1 && (size_t) pos == size) {
std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
return;
}
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += (size_t) ret;
}
} }
} }
void read_aligned_chunk(void * dest, size_t size) { uint32_t read_u32() const {
size_t offset = tell();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}
struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}
void read_raw(void * ptr, size_t len) {
if (has_direct_io()) {
read_aligned_chunk(ptr, len);
} else {
read_raw_unsafe(ptr, len);
}
}
uint32_t read_u32() {
uint32_t ret; uint32_t ret;
read_raw(&ret, sizeof(ret)); read_raw(&ret, sizeof(ret));
return ret; return ret;
@@ -346,48 +230,27 @@ struct llama_file::impl {
write_raw(&val, sizeof(val)); write_raw(&val, sizeof(val));
} }
bool has_direct_io() const {
return fd != -1 && alignment > 1;
}
~impl() { ~impl() {
if (fd != -1) { if (fp) {
close(fd);
} else {
std::fclose(fp); std::fclose(fp);
} }
} }
int fd = -1;
std::string fname;
#endif #endif
size_t read_alignment() const { FILE * fp;
return alignment; size_t size;
}
size_t alignment = 1;
FILE * fp{};
size_t size{};
}; };
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::~llama_file() = default; llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; } size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const { int llama_file::file_id() const {
#ifdef _WIN32 #ifdef _WIN32
return _fileno(pimpl->fp); return _fileno(pimpl->fp);
#else #else
if (pimpl->fd != -1) {
return pimpl->fd;
}
#if defined(fileno) #if defined(fileno)
return fileno(pimpl->fp); return fileno(pimpl->fp);
#else #else
@@ -397,14 +260,9 @@ int llama_file::file_id() const {
} }
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
#ifdef _WIN32
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#else
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
#endif
uint32_t llama_file::read_u32() { return pimpl->read_u32(); } uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
@@ -618,9 +476,9 @@ struct llama_mlock::impl {
char* errmsg = std::strerror(errno); char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM); bool suggest = (errno == ENOMEM);
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__) #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK // visionOS/tvOS dont't support RLIMIT_MEMLOCK
// Skip resource limit checks on these platforms // Skip resource limit checks on visionOS/tvOS
suggest = false; suggest = false;
#else #else
struct rlimit lock_limit; struct rlimit lock_limit;


@@ -3,7 +3,6 @@
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <cstdio>
struct llama_file; struct llama_file;
struct llama_mmap; struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>; using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
struct llama_file { struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false); llama_file(const char * fname, const char * mode);
~llama_file(); ~llama_file();
size_t tell() const; size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {
void seek(size_t offset, int whence) const; void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len); void read_raw(void * ptr, size_t len) const;
void read_raw_unsafe(void * ptr, size_t len); uint32_t read_u32() const;
void read_aligned_chunk(void * dest, size_t size);
uint32_t read_u32();
void write_raw(const void * ptr, size_t len) const; void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const; void write_u32(uint32_t val) const;
size_t read_alignment() const;
bool has_direct_io() const;
private: private:
struct impl; struct impl;
std::unique_ptr<impl> pimpl; std::unique_ptr<impl> pimpl;


@@ -2,7 +2,6 @@
#include "ggml.h" #include "ggml.h"
#include <algorithm>
#include <array> #include <array>
#include <cinttypes> #include <cinttypes>
#include <cstring> #include <cstring>
@@ -345,7 +344,6 @@ namespace GGUFMeta {
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
switch (arr_info.gt) { switch (arr_info.gt) {
case GGUF_TYPE_BOOL:
case GGUF_TYPE_UINT32: case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break; (std::is_same<T, uint32_t>::value)); break;
@@ -367,13 +365,7 @@ namespace GGUFMeta {
result[i] = value; result[i] = value;
} }
} else { } else {
if (arr_info.gt == GGUF_TYPE_BOOL) { std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
return static_cast<T>(x);
});
} else {
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
}
} }
return true; return true;
@@ -470,29 +462,6 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required); return get_key_or_arr(llm_kv(kid), result, n, required);
} }
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
// throw and error if type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}
return get_key(key, result, required);
}
// TODO: this is not very clever - figure out something better // TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -503,7 +472,6 @@ llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
std::vector<std::string> & splits, std::vector<std::string> & splits,
bool use_mmap, bool use_mmap,
bool use_direct_io,
bool check_tensors, bool check_tensors,
bool no_alloc, bool no_alloc,
const llama_model_kv_override * param_overrides_p, const llama_model_kv_override * param_overrides_p,
@@ -536,23 +504,9 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name)); llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx); contexts.emplace_back(ctx);
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
// Disable mmap, as DirectIO is available
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
} else {
// Disable DirectIO and reopen file using std::fopen for mmap
use_direct_io = false;
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
}
}
// Save tensors data offset of the main file. // Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used, // For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights. // so we build a unified tensors index for weights.
@@ -618,7 +572,7 @@ llama_model_loader::llama_model_loader(
} }
} }
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); files.emplace_back(new llama_file(fname_split, "rb"));
contexts.emplace_back(ctx); contexts.emplace_back(ctx);
// Save tensors data offset info of the shard. // Save tensors data offset info of the shard.
@@ -762,7 +716,6 @@ llama_model_loader::llama_model_loader(
} }
this->use_mmap = use_mmap; this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors; this->check_tensors = check_tensors;
this->no_alloc = no_alloc; this->no_alloc = no_alloc;
} }
@@ -982,15 +935,7 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers. // NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4; constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}
// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
std::vector<ggml_backend_buffer_t> host_buffers; std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events; std::vector<ggml_backend_event_t> events;
@@ -1040,7 +985,6 @@ bool llama_model_loader::load_all_data(
// If the backend is supported, create pinned memory buffers and events for synchronisation. // If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) { for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
if (!buf) { if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev)); ggml_backend_dev_name(dev));
@@ -1122,7 +1066,6 @@ bool llama_model_loader::load_all_data(
} }
} else { } else {
const auto & file = files.at(weight->idx); const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) { if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET); file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size); file->read_raw(cur->data, n_size);
@@ -1134,54 +1077,19 @@ bool llama_model_loader::load_all_data(
} else { } else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) { if (upload_backend) {
size_t offset = weight->offs; file->seek(weight->offs, SEEK_SET);
alignment = file->read_alignment();
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);
// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
size_t bytes_read = 0; size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)
while (bytes_read < read_end - read_start) { while (bytes_read < n_size) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]); ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
// Read aligned chunk from file ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;
// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}
// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}
// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend); ggml_backend_event_record(events[buffer_idx], upload_backend);
data_read += data_to_copy; bytes_read += read_iteration;
bytes_read += read_size;
++buffer_idx; ++buffer_idx;
buffer_idx %= n_buffers; buffer_idx %= n_buffers;
} }


@@ -70,7 +70,6 @@ struct llama_model_loader {
size_t n_bytes = 0; size_t n_bytes = 0;
bool use_mmap = false; bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors; bool check_tensors;
bool no_alloc; bool no_alloc;
@@ -98,7 +97,6 @@ struct llama_model_loader {
const std::string & fname, const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap, bool use_mmap,
bool use_direct_io,
bool check_tensors, bool check_tensors,
bool no_alloc, bool no_alloc,
const llama_model_kv_override * param_overrides_p, const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
template<typename T> template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
std::string get_arch_name() const; std::string get_arch_name() const;
enum llm_arch get_arch() const; enum llm_arch get_arch() const;


@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens()); add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
if (hparams.n_embd_out_impl > 0) {
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);

File diff suppressed because it is too large


@@ -11,7 +11,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
struct llama_cparams; struct llama_cparams;
@@ -25,14 +24,12 @@ enum llm_type {
LLM_TYPE_17M, LLM_TYPE_17M,
LLM_TYPE_22M, LLM_TYPE_22M,
LLM_TYPE_33M, LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M, LLM_TYPE_60M,
LLM_TYPE_70M, LLM_TYPE_70M,
LLM_TYPE_80M, LLM_TYPE_80M,
LLM_TYPE_109M, LLM_TYPE_109M,
LLM_TYPE_137M, LLM_TYPE_137M,
LLM_TYPE_140M, LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M, LLM_TYPE_160M,
LLM_TYPE_190M, LLM_TYPE_190M,
LLM_TYPE_220M, LLM_TYPE_220M,
@@ -42,7 +39,6 @@ enum llm_type {
LLM_TYPE_335M, LLM_TYPE_335M,
LLM_TYPE_350M, LLM_TYPE_350M,
LLM_TYPE_360M, LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M, LLM_TYPE_410M,
LLM_TYPE_450M, LLM_TYPE_450M,
LLM_TYPE_475M, LLM_TYPE_475M,
@@ -121,12 +117,10 @@ enum llm_type {
LLM_TYPE_31B_A3_5B, LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B, LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B, LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B, LLM_TYPE_E2B,
LLM_TYPE_E4B, LLM_TYPE_E4B,
@@ -471,6 +465,8 @@ struct llama_model {
struct ggml_tensor * dense_2_out_layers = nullptr; struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr; struct ggml_tensor * dense_3_out_layers = nullptr;
llama_model_params params;
// gguf metadata // gguf metadata
std::unordered_map<std::string, std::string> gguf_kv; std::unordered_map<std::string, std::string> gguf_kv;
@@ -480,9 +476,6 @@ struct llama_model {
// for quantize-stats only // for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name; std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
// for keeping track of associated LoRA adapters
std::unordered_set<llama_adapter_lora *> loras;
int64_t t_load_us = 0; int64_t t_load_us = 0;
int64_t t_start_us = 0; int64_t t_start_us = 0;
@@ -504,9 +497,6 @@ struct llama_model {
size_t n_tensors() const; size_t n_tensors() const;
size_t n_devices() const; size_t n_devices() const;
uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const; std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model // total number of parameters in the model
@@ -535,8 +525,6 @@ struct llama_model {
ggml_cgraph * build_graph(const llm_graph_params & params) const; ggml_cgraph * build_graph(const llm_graph_params & params) const;
private: private:
llama_model_params params;
struct impl; struct impl;
std::unique_ptr<impl> pimpl; std::unique_ptr<impl> pimpl;
}; };


@@ -422,6 +422,57 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
++qs.i_ffn_up; ++qs.i_ffn_up;
} }
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
{
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
return new_type; return new_type;
} }
@@ -545,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} }
std::vector<std::string> splits = {}; std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params()); llama_model model(llama_model_default_params());
@@ -824,69 +875,21 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc. // get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) { if (!params->pure && ggml_is_quantized(default_type)) {
// if the user provided tensor types - use those int fallback = qs.n_fallback;
bool manual = false; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
if (params->tensor_types) { // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
if (params->tensor_types && qs.n_fallback - fallback == 0) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name); const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) { for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) { if (qtype != new_type) {
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
manual = true;
break;
} }
} }
} }
} }
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
{
bool convert_incompatible_tensor = false;
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
}
} }
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type; new_type = params->token_embedding_type;

File diff suppressed because it is too large


@@ -14,19 +14,7 @@ struct llama_grammar;
struct llama_sampler_chain { struct llama_sampler_chain {
llama_sampler_chain_params params; llama_sampler_chain_params params;
// has .backend_init() been called? std::vector<struct llama_sampler *> samplers;
bool is_init = false;
struct info {
bool is_backend;
llama_sampler * ptr;
};
std::vector<info> samplers;
// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
std::vector<llama_token_data> cur;
// timing // timing
@@ -36,9 +24,9 @@ struct llama_sampler_chain {
}; };
struct llama_sampler * llama_sampler_init_dry_testing( struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size, int32_t context_size,
float dry_multiplier, float dry_multiplier,
float dry_base, float dry_base,
int32_t dry_allowed_length, int32_t dry_allowed_length,
int32_t dry_penalty_last_n, int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>> & seq_breakers); const std::vector<std::vector<llama_token>>& seq_breakers);


@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2: case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = { regex_exprs = {
// original regex from tokenizer.json // original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -461,13 +454,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
default: default:
// default regex for BPE tokenization pre-processing // default regex for BPE tokenization pre-processing
regex_exprs = { regex_exprs = {
@@ -1863,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") { tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if ( } else if (
tokenizer_pre == "falcon") { tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1886,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" || tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum" || tokenizer_pre == "mellum") {
tokenizer_pre == "modern-bert" ) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if ( } else if (
tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v1-en" ||
@@ -1961,9 +1941,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if ( } else if (
tokenizer_pre == "exaone4") { tokenizer_pre == "exaone4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "exaone-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
} else if ( } else if (
tokenizer_pre == "chameleon") { tokenizer_pre == "chameleon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -2026,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") { tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2076,7 +2049,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx); scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
} }
const uint32_t n_scores = score_idx != -1 ? gguf_get_arr_n(ctx, score_idx) : 0;
const int * toktypes = nullptr; const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) { if (toktype_idx != -1) {
@@ -2098,7 +2070,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
auto & token_data = id_to_token[i]; auto & token_data = id_to_token[i];
token_data.text = std::move(word); token_data.text = std::move(word);
token_data.score = (scores && i < n_scores) ? scores[i] : 0.0f; token_data.score = scores ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
@@ -2204,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, we apply this workaround to find the tokens based on their text // for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc. // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) { if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false if (false
@@ -2221,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling || t.first == "<end_of_utterance>" // smoldocling
) { ) {
special_eot_id = t.second; special_eot_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2235,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
) { ) {
special_eom_id = t.second; special_eom_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2255,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_prefix|>" // GLM-4.5 || t.first == "<|code_prefix|>" // GLM-4.5
) { ) {
special_fim_pre_id = t.second; special_fim_pre_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2275,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_suffix|>" // GLM-4.5 || t.first == "<|code_suffix|>" // GLM-4.5
) { ) {
special_fim_suf_id = t.second; special_fim_suf_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2295,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_middle|>" // GLM-4.5 || t.first == "<|code_middle|>" // GLM-4.5
) { ) {
special_fim_mid_id = t.second; special_fim_mid_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2312,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<PAD>" || t.first == "<PAD>"
) { ) {
special_fim_pad_id = t.second; special_fim_pad_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2330,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<reponame>" // Granite || t.first == "<reponame>" // Granite
) { ) {
special_fim_rep_id = t.second; special_fim_rep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2344,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|file_sep|>" // Qwen || t.first == "<|file_sep|>" // Qwen
) { ) {
special_fim_sep_id = t.second; special_fim_sep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
} }
// auto-detect unused tokens: e.g. control tokens with the word "unused"
// ideally, these tokens should be marked as unused during conversion
{
uint32_t n_unused = 0;
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
continue;
}
if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
if (strstr(t.first.c_str(), "unused") != NULL) {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
}
}
if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
n_unused++;
}
}
LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
}
// maintain a list of tokens that cause end-of-generation // maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal // this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2397,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} }
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (false if (false
|| t.first == "<|eot_id|>" || t.first == "<|eot_id|>"
|| t.first == "<|im_end|>" || t.first == "<|im_end|>"
|| t.first == "<|end|>" || t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony || t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony || t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>" || t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
@@ -2416,31 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling || t.first == "<end_of_utterance>" // smoldocling
) { ) {
special_eog_ids.insert(t.second); special_eog_ids.insert(t.second);
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} else { } else {
if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) { // token is control, but not marked as EOG -> print a debug log
// token is control, but not marked as EOG -> print a debug log if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
if (special_eog_ids.count(t.second) == 0) { LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", __func__, t.second, t.first.c_str());
__func__, t.second, t.first.c_str());
}
} }
} }
} }
// @ngxson : quick hack for gpt-oss, always render these tokens // @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") { if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n", id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
__func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
} }
} }
@@ -2460,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open), // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
// we remove the "<|end|>" token from the EOG list // we remove the "<|end|>" token from the EOG list
{ {
bool has_return = false; bool has_return = false;
bool has_call = false; bool has_call = false;
bool has_end = false; bool has_end = false;
bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL; llama_token end_id = LLAMA_TOKEN_NULL;
LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__); LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) { for (auto tid : special_eog_ids) {
auto & text = id_to_token[tid].text; LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str()); if (id_to_token[tid].text == "<|return|>") {
if (text == "<|return|>") {
has_return = true; has_return = true;
} else if (text == "<|call|>" || text == "<|calls|>") { } else if (id_to_token[tid].text == "<|call|>") {
has_call = true; has_call = true;
} else if (text == "<|flush|>") { } else if (id_to_token[tid].text == "<|end|>") {
has_flush = true;
} else if (text == "<|end|>") {
has_end = true; has_end = true;
end_id = tid; end_id = tid;
} }
} }
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) { if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id); special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
auto & attr = id_to_token[end_id].attr; LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
} }
} }
} }
@@ -2593,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) { for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
} }
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0 ) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
} }
} }
} }
@@ -3293,34 +3211,34 @@ int32_t llama_vocab::impl::detokenize(
} }
void llama_vocab::impl::print_info() const { void llama_vocab::impl::print_info() const {
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
// special tokens // special tokens
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); } if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); } if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
for (const auto & id : special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
}
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {

View File

@@ -51,9 +51,6 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
}; };
struct LLM_KV; struct LLM_KV;

View File

@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}, &ud); }, &ud);
llama_model_params mparams_copy = *mparams; llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true; mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false; mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy); llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) { if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
} }
} }
for (size_t i = 0; i < ret.size(); i++) { for (size_t i = 0; i < ret.size(); i++) {
size_t free; size_t free, total;
size_t total;
ggml_backend_dev_memory(model->devices[i], &free, &total); ggml_backend_dev_memory(model->devices[i], &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
ret[i].free = free; ret[i].free = free;
ret[i].total = total; ret[i].total = total;
} }
@@ -152,15 +139,12 @@ enum layer_fraction_t {
}; };
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
class llama_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static void llama_params_fit_impl( static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024; constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t; typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params(); const llama_model_params default_mparams = llama_model_default_params();
@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
return; return;
} }
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
std::vector<std::string> dev_names; std::vector<std::string> dev_names;
{ {
dev_names.reserve(nd); dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
} }
} }
int64_t sum_free = 0; int64_t sum_total = 0;
int64_t sum_projected_free = 0; int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0; int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_model = 0; int64_t sum_projected_used = 0;
std::vector<int64_t> projected_free_per_device; int64_t sum_projected_ctx = 0;
projected_free_per_device.reserve(nd);
if (nd > 1) { if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,106 +194,63 @@ static void llama_params_fit_impl(
const int64_t projected_used = dmd.mb.total(); const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used; const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free; sum_total += dmd.total;
sum_projected_used += projected_used; sum_projected_used += projected_used;
sum_projected_free += projected_free; sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model; min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
if (nd > 1) { if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
} }
} }
assert(sum_free >= 0 && sum_projected_used >= 0); assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB); __func__, sum_projected_used/MiB, sum_total/MiB);
if (nd == 1) { if (min_projected_free >= margin) {
if (projected_free_per_device[0] >= margins[0]) { if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB); __func__, min_projected_free/MiB, margin/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return; return;
} }
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
} }
// step 2: try reducing memory use by reducing the context size // step 2: try reducing memory use by reducing the context size
{ {
int64_t global_surplus = sum_projected_free; int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
if (global_surplus < 0) { if (global_surplus < 0) {
if (nd == 1) { LLAMA_LOG_INFO(nd == 1 ?
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
__func__, margins[0]/MiB, -global_surplus/MiB); "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
} else { __func__, margin/MiB, -global_surplus/MiB);
LLAMA_LOG_INFO(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) { if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) { if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free; const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
for (size_t id = 0; id < nd; id++) { const uint32_t ctx_reduction = std::min(
sum_used_target -= margins[id]; uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
} cparams->n_ctx = hp_nct - ctx_reduction;
if (nd > 1) { const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
// for multiple devices we need to be more conservative in terms of how much context we think can fit: global_surplus += memory_reduction;
// - for dense models only whole layers can be assigned to devices LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
// - on average we expect a waste of 0.5 layers/tensors per device if (global_surplus >= 0) {
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd == 1) { if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return; return;
} }
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__); LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
} }
} else { } else {
if (n_ctx_min == UINT32_MAX) { LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); __func__, hp_nct, n_ctx_min);
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
} }
} else { } else {
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
@@ -325,28 +259,32 @@ static void llama_params_fit_impl(
} }
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
} }
if (nd > 1) { if (nd > 1) {
if (!tensor_split) { if (!tensor_split) {
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort"); throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
} }
if (mparams->tensor_split) { if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) { if (mparams->tensor_split[id] != 0.0f) {
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort"); throw std::runtime_error("model_params::tensor_split already set by user, abort");
} }
} }
} }
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
} }
} }
if (!tensor_buft_overrides) { if (!tensor_buft_overrides) {
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort"); throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
} }
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort"); throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
} }
// step 3: iteratively fill the back to front with "dense" layers // step 3: iteratively fill the back to front with "dense" layers
@@ -399,11 +337,6 @@ static void llama_params_fit_impl(
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE: // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE; layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
}; };
const size_t ntbo = llama_max_tensor_buft_overrides(); const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -412,7 +345,8 @@ static void llama_params_fit_impl(
auto set_ngl_tensor_split_tbo = [&]( auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device, const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts, const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) { llama_model_params & mparams,
const bool add_nonrepeating) {
mparams.n_gpu_layers = 0; mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer; mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -420,25 +354,29 @@ static void llama_params_fit_impl(
tensor_split[id] = ngl_per_device[id].n_layer; tensor_split[id] = ngl_per_device[id].n_layer;
} }
} }
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split; mparams.tensor_split = tensor_split;
size_t itbo = 0; size_t itbo = 0;
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full(); il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) { for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) { if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr; tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr; tensor_buft_overrides[itbo].buft = nullptr;
itbo++; itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides; mparams.tensor_buft_overrides = tensor_buft_overrides;
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == " throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model"); + std::to_string(ntbo) + " is insufficient for model\n");
} }
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type(); tensor_buft_overrides[itbo].buft = overflow_bufts[id];
itbo++; itbo++;
} }
il0 += ngl_per_device[id].n_part; il0 += ngl_per_device[id].n_part;
@@ -453,9 +391,10 @@ static void llama_params_fit_impl(
auto get_memory_for_layers = [&]( auto get_memory_for_layers = [&](
const char * func_name, const char * func_name,
const std::vector<ngl_t> & ngl_per_device, const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> { const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams; llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
const dmds_t dmd_nl = llama_get_device_memory_data( const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -488,9 +427,9 @@ static void llama_params_fit_impl(
const dmds_t dmds_cpu_moe = llama_get_device_memory_data( const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) { for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free; global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id]; global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
} }
if (global_surplus_cpu_moe > 0) { if (global_surplus_cpu_moe > 0) {
@@ -509,18 +448,27 @@ static void llama_params_fit_impl(
std::vector<int64_t> targets; // maximum acceptable memory use per device std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd); targets.reserve(nd);
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]); targets.push_back(dmds_full[id].free - margin);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
} }
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to: // whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd); overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
} }
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd); std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
// optimize the number of layers per device using the method of false position: // optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound // - ngl_per_device has 0 layers for each device, lower bound
@@ -528,30 +476,22 @@ static void llama_params_fit_impl(
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound // - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) { if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else { } else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
} }
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) { for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device; std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned; ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) { if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
} }
if (ngl_per_device_high[id].n_layer > 0) { if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) { if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) { while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1)); step_size = std::max(step_size, uint32_t(1));
@@ -560,26 +500,25 @@ static void llama_params_fit_impl(
std::vector<ngl_t> ngl_per_device_test = ngl_per_device; std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size; ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) { if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ? ngl_per_device_test[id].n_part += step_size;
step_size - 1 : step_size; // the first layer is the output layer which must always be full
} }
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) { if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
mem = mem_test; mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else { } else {
ngl_per_device_high = ngl_per_device_test; ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test; mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} }
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
} }
} else { } else {
assert(ngl_per_device_high[id].n_layer == n_unassigned); ngl_per_device = ngl_per_device_high;
ngl_per_device = ngl_per_device_high; n_unassigned -= ngl_per_device[id].n_layer;
mem = mem_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} }
} }
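Note: the loop above searches for the largest per-device layer count that still fits under the memory target using the method of false position: keep a low bound that fits and a high bound that does not, and pick the next guess by linearly interpolating the measured memory use. A toy version of that search against a synthetic cost function (the real code re-measures memory through a dry-run model load for each candidate):

// Toy regula-falsi search: largest n_layers whose cost stays under a target.
#include <cstdint>
#include <cstdio>

// synthetic, monotonically increasing memory model (bytes), standing in for
// the dry-run measurement the real code performs per candidate layer count
static int64_t cost(uint32_t n_layers) {
    return int64_t(n_layers) * 350 + 2000;   // fixed overhead + per-layer cost
}

int main() {
    const int64_t target = 12000;

    uint32_t lo = 0;                 // known to fit
    uint32_t hi = 48;                // assumed not to fit
    int64_t  mem_lo = cost(lo);
    int64_t  mem_hi = cost(hi);

    uint32_t delta = hi - lo;
    while (delta > 1) {
        // interpolate where the cost line crosses the target, step at least 1 layer
        uint32_t step = uint32_t(int64_t(delta) * (target - mem_lo) / (mem_hi - mem_lo));
        if (step < 1) step = 1;
        if (step >= delta) step = delta - 1;

        const uint32_t guess = lo + step;
        const int64_t  mem_guess = cost(guess);
        if (mem_guess <= target) { lo = guess; mem_lo = mem_guess; }   // new lower bound, still fits
        else                     { hi = guess; mem_hi = mem_guess; }   // new upper bound, too big
        delta = hi - lo;
    }
    std::printf("best fit: %u layers, %lld bytes <= %lld\n", lo, (long long) cost(lo), (long long) target);
}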
@@ -590,7 +529,7 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
} }
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
return; return;
} }
@@ -610,20 +549,24 @@ static void llama_params_fit_impl(
assert(id_dense_start < nd); assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) { for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device; std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) { for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
ngl_per_device_high[id].n_layer += n_layer_move; ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move; ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0; ngl_per_device_high[jd].n_part = 0;
} }
size_t id_dense_start_high = nd - 1; size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) { if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
while (delta > 1) { while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1)); step_size = std::max(step_size, uint32_t(1));
@@ -639,11 +582,11 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].n_layer += n_convert_jd; ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd; n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) { if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
break; break;
} }
} }
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) { if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
@@ -658,38 +601,32 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high); __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
} }
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
} }
} else { } else {
ngl_per_device = ngl_per_device_high; ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high; id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} }
// try to fit at least part of one more layer // try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) { if (ngl_per_device[id_dense_start].n_layer > 0) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device; std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start; size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--; ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--; ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++; ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++; ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) { if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
id_dense_start_test++; id_dense_start_test++;
} }
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -697,10 +634,9 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -709,10 +645,9 @@ static void llama_params_fit_impl(
} else { } else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -727,41 +662,30 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
} }
// print info for devices that were not changed during the conversion from dense only to full layers: set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
} }
enum llama_params_fit_status llama_params_fit( bool llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) { size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us(); const int64_t t0_us = llama_time_us();
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS; bool ok = true;
try { try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level); llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) { } catch (const std::runtime_error & e) {
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what()); LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_ERROR; ok = false;
} }
const int64_t t1_us = llama_time_us(); const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status; return ok;
} }
struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = { struct llama_sampler_chain_params result = {
/*.no_perf =*/ true, /*.no_perf =*/ true,
}; };
return result; return result;
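Note: the llama_params_fit change earlier in this hunk swaps between two error-reporting styles. The removed side throws a dedicated exception type for expected "could not fit" outcomes and catches it before the generic std::runtime_error so it can return a distinct status; the restored side returns a plain bool. A small standalone illustration of the dedicated-exception pattern is below; the enum and function names are invented for the example.

// Illustrative only: a dedicated exception type lets the caller separate
// "expected failure" from "unexpected error" while using exceptions internally.
#include <cstdio>
#include <stdexcept>

enum fit_status { FIT_SUCCESS, FIT_FAILURE, FIT_ERROR };   // invented names

class fit_exception : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;               // expected, recoverable failures
};

static void fit_impl(bool user_already_set_layers) {
    if (user_already_set_layers) {
        throw fit_exception("n_gpu_layers already set by user, abort");          // expected path
    }
    throw std::runtime_error("backend returned an inconsistent memory report");  // unexpected path
}

static fit_status fit(bool user_already_set_layers) {
    try {
        fit_impl(user_already_set_layers);
        return FIT_SUCCESS;
    } catch (const fit_exception & e) {        // must come before the base class
        std::fprintf(stderr, "fit failed: %s\n", e.what());
        return FIT_FAILURE;
    } catch (const std::runtime_error & e) {
        std::fprintf(stderr, "fit error: %s\n", e.what());
        return FIT_ERROR;
    }
}

int main() {
    std::printf("status A = %d\n", fit(true));    // FIT_FAILURE
    std::printf("status B = %d\n", fit(false));   // FIT_ERROR
}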
@@ -834,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us; model.t_start_us = tm.t_start_us;
try { try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info(); ml.print_info();
@@ -1097,55 +1021,25 @@ int32_t llama_chat_apply_template(
// model split // model split
// //
int32_t llama_split_path( int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
char * split_path,
size_t maxlen,
const char * path_prefix,
int32_t split_no,
int32_t split_count) {
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
const int written = snprintf( return strlen(split_path);
split_path,
maxlen,
SPLIT_PATH_FORMAT,
path_prefix,
split_no + 1,
split_count
);
if (written < 0 || (size_t) written >= maxlen) {
return 0;
} }
return 0;
return (int32_t) written;
} }
int32_t llama_split_prefix( int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
char * split_prefix, std::string str_split_path(split_path);
size_t maxlen,
const char * split_path,
int32_t split_no,
int32_t split_count) {
const std::string str_split_path(split_path);
char postfix[32]; char postfix[32];
snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count); snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);
const std::string str_postfix(postfix); // check if split_prefix ends with postfix
if (str_split_path.size() <= str_postfix.size()) { int size_prefix = str_split_path.size() - str_postfix.size();
return 0; if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
} snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
return size_prefix;
const size_t size_prefix = str_split_path.size() - str_postfix.size();
if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
const size_t copy_len = std::min(size_prefix + 1, maxlen);
snprintf(split_prefix, copy_len, "%s", split_path);
return (int32_t) size_prefix;
} }
return 0; return 0;
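Note: both versions of llama_split_path and llama_split_prefix above revolve around the same naming scheme, "%s-%05d-of-%05d.gguf". The following self-contained demonstration builds such a shard path and recovers the prefix again with plain C/C++ string handling, independent of llama.cpp.

// Round-trip of the "<prefix>-%05d-of-%05d.gguf" split naming scheme.
#include <cstdio>
#include <string>

int main() {
    const char * prefix      = "models/llama-70b-q4";   // example prefix, not a real file
    const int    split_no    = 2;                        // zero-based index of this shard
    const int    split_count = 5;                        // total number of shards

    // build the path of one shard (shard numbers are printed one-based)
    char path[512];
    std::snprintf(path, sizeof(path), "%s-%05d-of-%05d.gguf", prefix, split_no + 1, split_count);
    std::printf("path   = %s\n", path);   // models/llama-70b-q4-00003-of-00005.gguf

    // recover the prefix: strip the "-00003-of-00005.gguf" postfix if it matches
    char postfix[32];
    std::snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);

    const std::string p(path);
    const std::string suffix(postfix);
    if (p.size() > suffix.size() && p.compare(p.size() - suffix.size(), suffix.size(), suffix) == 0) {
        std::printf("prefix = %s\n", p.substr(0, p.size() - suffix.size()).c_str());
    }
}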

View File

@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
// dual attention normalization (pre) // dual attention normalization (pre)
cur = build_norm(inpL, cur = build_norm(inpL,
model.layers[il].attn_norm, NULL, model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_normed", il); cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il); cb(Kcur, "Kcur_normed", il);
// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) { if (use_rope) {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il); cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il); cb(Kcur, "Kcur_rope", il);
} }
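Note: the use_rope condition above applies RoPE to a layer only when n_no_rope_layer_step is positive and (il + 1) is not a multiple of it, so every n-th layer skips RoPE. A quick sketch of which layers that selects; the step value is just an example, not taken from a real model.

// Which layers get RoPE under the "(il + 1) % n_no_rope_layer_step != 0" rule.
#include <cstdio>

int main() {
    const int n_layer = 12;
    const int n_no_rope_layer_step = 4;   // example value only

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = n_no_rope_layer_step > 0 &&
                              (il + 1) % n_no_rope_layer_step != 0;
        std::printf("layer %2d: %s\n", il, use_rope ? "rope" : "no rope");
    }
    // with step 4, layers 3, 7, 11 (zero-based) skip RoPE
}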

View File

@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il); LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) { } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur, cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
type_op, LLM_FFN_PAR, il); model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else { } else {
cur = build_ffn(cur, cur = build_ffn(cur,
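Note: the removed lines above infer a fused up+gate projection when there is no separate ffn_gate tensor and the up projection's output width differs from n_ff, and then pick GEGLU instead of GELU. Below is a minimal sketch of that shape check using a hypothetical tensor struct, not the ggml type.

// Shape-based detection of a fused up+gate FFN projection (illustrative types only).
#include <cstdint>
#include <cstdio>

struct fake_tensor { int64_t ne[2]; };   // ne[1] = output width, mirroring ggml's convention

enum ffn_act { FFN_GELU, FFN_GEGLU };

static ffn_act pick_activation(const fake_tensor * ffn_up, const fake_tensor * ffn_gate, int64_t n_ff) {
    // no dedicated gate tensor plus a wider-than-n_ff up projection means the
    // gate is packed into ffn_up, so the split-and-gate (GEGLU) path is needed
    const bool up_contains_gate = ffn_gate == nullptr && ffn_up->ne[1] != n_ff;
    return up_contains_gate ? FFN_GEGLU : FFN_GELU;
}

int main() {
    const int64_t n_ff = 4096;
    fake_tensor fused = { { 1024, 2 * n_ff } };   // up and gate packed together
    fake_tensor plain = { { 1024, n_ff } };       // plain up projection

    std::printf("fused : %s\n", pick_activation(&fused, nullptr, n_ff) == FFN_GEGLU ? "GEGLU" : "GELU");
    std::printf("plain : %s\n", pick_activation(&plain, nullptr, n_ff) == FFN_GEGLU ? "GEGLU" : "GELU");
}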

View File

@@ -3,14 +3,12 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot); GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * inpL; ggml_tensor *inpL, *cur;
ggml_tensor * cur;
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
} }
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention // build self attention
{ {

View File

@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il); const bool is_swa = hparams.is_swa(il);
// UNUSED:
// const float freq_base_l = model.get_rope_freq_base (cparams, il);
// const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm // norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

View File

@@ -2,11 +2,14 @@
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const bool is_mla = hparams.is_mla(); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla(); const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla(); const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
@@ -40,8 +43,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr; auto * inp_attn = build_attn_inp_kv();
auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -55,9 +57,6 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// self_attention // self_attention
{ {
ggml_tensor * q = NULL; ggml_tensor * q = NULL;
const bool is_lite = model.layers[il].wq;
if (!is_lite) { if (!is_lite) {
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il); cb(q, "q", il);
@@ -125,14 +124,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
// note: rope must go first for in-place context shifting in build_rope_shift() // note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
cb(kv_cmpr, "kv_cmpr_reshape", il); cb(kv_cmpr, "kv_cmpr_reshape", il);
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens} // {kv_lora_rank, 1, n_tokens}
@@ -146,7 +145,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
} }
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k, cur = build_attn(inp_attn,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else { } else {
@@ -170,10 +169,11 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
Vcur = ggml_cont(ctx0, Vcur); Vcur = ggml_cont(ctx0, Vcur);
cb(Vcur, "Vcur_cont", il); cb(Vcur, "Vcur_cont", il);
ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0); // note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
if (inp_attn_scale) { if (inp_attn_scale) {
@@ -183,7 +183,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
} }
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn_kv, cur = build_attn(inp_attn,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
} }
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b, model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used, n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm, LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale, true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func, (llama_expert_gating_func_type) hparams.expert_gating_func,
il); il);
cb(moe_out, "ffn_moe_out", il); cb(moe_out, "ffn_moe_out", il);

View File

@@ -1,146 +0,0 @@
#include "models.h"
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn_iswa = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// use RoPE for SWA layers
const bool is_local_layer = hparams.is_swa(il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
if (is_local_layer) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn_iswa,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// norm
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
// final norm
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
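A minimal sketch of the feed-forward combination built in the MoE branch above: the routed expert output and the always-active shared expert are summed before the residual add back onto ffn_inp. The helper below is illustrative only and is not part of the vendored sources.

static ggml_tensor * combine_moe_and_shared_expert(
        ggml_context * ctx0,
        ggml_tensor  * moe_out,    // output of build_moe_ffn (routed experts)
        ggml_tensor  * ffn_shexp,  // output of the shared-expert build_ffn
        ggml_tensor  * ffn_inp) {  // attention output + layer input (residual stream)
    ggml_tensor * cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed experts + shared expert
    return ggml_add(ctx0, cur, ffn_inp);                    // residual connection
}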

View File

@@ -1,5 +1,7 @@
#include "models.h" #include "models.h"
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k; const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
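A minimal sketch (assumed semantics) of the conditional scaling introduced above: token-id embeddings are scaled by sqrt(n_embd), as in the original Transformer, while raw pre-encoded inputs such as image embeddings pass through unchanged. The helper name is hypothetical.

static ggml_tensor * maybe_scale_token_embd(
        ggml_context * ctx0,
        ggml_tensor  * inpL,            // output of build_inp_embd
        bool           is_token_input,  // ubatch.token in the builders above
        int64_t        n_embd) {
    // only token lookups are scaled; already-encoded embeddings keep their magnitude
    return is_token_input ? ggml_scale(ctx0, inpL, sqrtf((float) n_embd)) : inpL;
}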

View File

@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm // norm
cur = build_norm(inpL, cur = build_norm(inpL,
model.layers[il].attn_norm, NULL, model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);

View File

@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -1,5 +1,7 @@
#include "models.h" #include "models.h"
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params), llm_graph_context(params),
model(model), model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -245,30 +248,20 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code // equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens] // output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
auto inp = std::make_unique<llm_graph_input_embd>(n_embd); auto inp = std::make_unique<llm_graph_input_embd>();
ggml_tensor * inp_per_layer; ggml_tensor * inp_per_layer;
if (ubatch.token) { if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens); ggml_set_input(inp->tokens);
res->t_inp_tokens = inp->tokens; res->t_tokens = inp->tokens;
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens); inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
cb(inp_per_layer, "inp_per_layer_selected", -1); cb(inp_per_layer, "inp_per_layer_selected", -1);
res->add_input(std::move(inp));
} else { } else {
// Vision embedding path: use padding token (ID=0) embedding GGML_ABORT("TODO: support embd input");
// TODO: verify if this is the correct behavior in transformers implementation
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
// Extract and dequantize padding token embedding (row 0)
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
cb(inp_per_layer, "inp_per_layer_vision", -1);
} }
res->add_input(std::move(inp));
return inp_per_layer; return inp_per_layer;
} }
@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
-1); // [n_embd_altup, n_layer, n_tokens] -1); // [n_embd_altup, n_layer, n_tokens]
cb(per_layer_proj, "per_layer_proj", -1); cb(per_layer_proj, "per_layer_proj", -1);
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer); inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
cb(inp_per_layer, "inp_per_layer", -1); cb(inp_per_layer, "inp_per_layer", -1);

View File

@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 && const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0; (il + 1) % hparams.n_no_rope_layer_step != 0;
@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
if (use_rope) { if (use_rope) {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors, ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors, ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
} else if (inp_attn_scale) { } else if (inp_attn_scale) {
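A hedged sketch of the per-layer RoPE gating used above: with n_no_rope_layer_step set to 4, for example, layers 3, 7, 11, ... (0-based) skip rotary embedding, and all other layers apply it with the global freq_base/freq_scale. The helper is illustrative only.

static bool layer_uses_rope(int il, uint32_t n_no_rope_layer_step) {
    // same condition as in the builder above
    return n_no_rope_layer_step > 0 && (il + 1) % n_no_rope_layer_step != 0;
}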

View File

@@ -1,7 +1,6 @@
#include "models.h" #include "models.h"
template <bool embed> llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>; auto * inp_attn = build_attn_inp_kv();
inp_attn_type * inp_attn = nullptr;
if constexpr (embed) {
inp_attn = build_attn_inp_no_cache();
} else {
inp_attn = build_attn_inp_kv();
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(cur, "result_norm", -1); cb(cur, "result_norm", -1);
res->t_embd = cur; res->t_embd = cur;
if constexpr (!embed) { // lm_head
// lm_head cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
res->t_logits = cur; res->t_logits = cur;
}
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
template struct llm_build_llama<false>;
template struct llm_build_llama<true>;
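A hedged sketch of the compile-time selection being removed above: the templated builder picked the cache-free attention input type for the embedding variant and the KV-cache type for the decoder variant (type names as used elsewhere in this file; the alias itself is illustrative).

template <bool embed>
using llama_attn_inp_t = std::conditional_t<embed,
    llm_graph_input_attn_no_cache,   // embedding models: no KV cache
    llm_graph_input_attn_kv>;        // decoder models: KV-cache-backed attention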

View File

@@ -1,117 +0,0 @@
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
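Taken together, each of the n_layer blocks above implements the standard pre-norm decoder recurrence (a hedged summary using the tensor names from the code; note that the per-head Q/K RMSNorm is applied after RoPE in this builder):

$$h_\ell = x_\ell + \mathrm{Attn}(\mathrm{RMSNorm}(x_\ell)), \qquad x_{\ell+1} = h_\ell + \mathrm{FFN}_{\mathrm{SwiGLU}}(\mathrm{RMSNorm}(h_\ell))$$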

View File

@@ -1,123 +0,0 @@
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
uint32_t n_head_l = hparams.n_head(il);
uint32_t n_head_kv_l = hparams.n_head_kv(il);
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// self_attention
{
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
ggml_tensor * sinks = model.layers[il].attn_sinks;
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -9,7 +9,6 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv; const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur; ggml_tensor * cur;

View File

@@ -167,10 +167,6 @@ struct llm_build_exaone : public llm_graph_context {
llm_build_exaone(const llama_model & model, const llm_graph_params & params); llm_build_exaone(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_exaone_moe : public llm_graph_context {
llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_falcon : public llm_graph_context { struct llm_build_falcon : public llm_graph_context {
llm_build_falcon(const llama_model & model, const llm_graph_params & params); llm_build_falcon(const llama_model & model, const llm_graph_params & params);
}; };
@@ -307,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
}; };
template <bool embed>
struct llm_build_llama : public llm_graph_context { struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params); llm_build_llama(const llama_model & model, const llm_graph_params & params);
}; };
@@ -316,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params); llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_maincoder : public llm_graph_context {
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mamba : public llm_graph_context_mamba { struct llm_build_mamba : public llm_graph_context_mamba {
llm_build_mamba(const llama_model & model, const llm_graph_params & params); llm_build_mamba(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_mimo2_iswa : public llm_graph_context {
llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_minicpm3 : public llm_graph_context { struct llm_build_minicpm3 : public llm_graph_context {
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
}; };
@@ -340,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params); llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context { struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params); llm_build_mpt(const llama_model & model, const llm_graph_params & params);
}; };
@@ -413,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
llm_build_plamo(const llama_model & model, const llm_graph_params & params); llm_build_plamo(const llama_model & model, const llm_graph_params & params);
}; };
template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_plm : public llm_graph_context { struct llm_build_plm : public llm_graph_context {
llm_build_plm(const llama_model & model, const llm_graph_params & params); llm_build_plm(const llama_model & model, const llm_graph_params & params);
}; };
@@ -470,8 +448,7 @@ private:
ggml_tensor * cur, ggml_tensor * cur,
int il); int il);
// returns pair of output and new state ggml_tensor * build_delta_net_chunking(
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -483,8 +460,7 @@ private:
ggml_tensor * diag_mask, ggml_tensor * diag_mask,
int il); int il);
// returns pair of output and new state ggml_tensor * build_delta_net_autoregressive(
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -499,11 +475,6 @@ private:
ggml_tensor * gate, ggml_tensor * gate,
int layer); int layer);
// returns pair of qkv, z
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
ggml_tensor * input,
int il);
const llama_model & model; const llama_model & model;
}; };

View File

@@ -1,116 +0,0 @@
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
// embed layer norm
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// attention layer norm
if (model.layers[il].attn_norm) {
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
}
// self attention
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const size_t type_size = ggml_type_size(cur->type);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// re-add the layer input
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// attention layer norm
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
// the attention output bypasses the intermediate (FFN) layer via the residual
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
// extracting cls token
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
cb(cur, "cls_pooled_embd", -1);
}
cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
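A hedged sketch of the CLS pooling step above: the final-norm output has shape [n_embd, n_tokens], so a 1-D view of the first n_embd elements at byte offset 0 keeps exactly the first (CLS) token's embedding. The helper is illustrative only.

static ggml_tensor * pool_cls_token(ggml_context * ctx0, ggml_tensor * embd, int64_t n_embd) {
    // view of column 0, i.e. the embedding of the CLS token
    return ggml_view_1d(ctx0, embd, n_embd, 0);
}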

View File

@@ -67,7 +67,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const llama_model & model, const llama_model & model,
const int64_t n_embd_head, const int64_t n_embd_head,
const int il) { const int il) {
// compute Q and K // compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
if (model.layers[il].bq) { if (model.layers[il].bq) {

View File

@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// norm // norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );

View File

@@ -1,128 +0,0 @@
#include "models.h"
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * residual = inpL;
float freq_base_l = 0.0f;
float freq_scale_l = 0.0f;
if constexpr (iswa) {
freq_base_l = model.get_rope_freq_base (cparams, il);
freq_scale_l = model.get_rope_freq_scale(cparams, il);
} else {
freq_base_l = freq_base;
freq_scale_l = freq_scale;
}
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const int32_t n_head = hparams.n_head(il);
const int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;
const int64_t k_offset = head_dim_q * n_head;
const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "attn_q_norm", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "attn_k_norm", il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
cb(cur, "attn_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "attn_residual", il);
residual = cur;
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_plamo3<false>;
template struct llm_build_plamo3<true>;

View File

@@ -5,7 +5,6 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &
const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv; const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur; ggml_tensor * cur;

View File

@@ -86,15 +86,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
// utility to get one slice from the third dimension ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// input dim: [x, y, c, b]
// output dim: [x, y, 1, b]
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -195,16 +187,18 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast = ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(decay_mask, "decay_mask", il);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask); decay_mask = ggml_exp(ctx0, decay_mask);
@@ -214,7 +208,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(attn, "attn_pre_solve", il);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
@@ -222,7 +217,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask); attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity); attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(attn, "attn_solved", il);
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
@@ -230,126 +226,116 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
cb(kbeta_gexp, "kbeta_gexp", il);
ggml_tensor * k_cumdecay = ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); cb(k_cumdecay, "k_cumdecay", il);
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// vectorized calculation of key_gdiff
// improved from the chunked version:
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
// get last element in g_cumsum along chunk_size dimension (ne0)
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
g_last = ggml_cont(ctx0, g_last);
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * core_attn_out = nullptr; ggml_tensor * core_attn_out = nullptr;
ggml_tensor * new_state = ggml_dup(ctx0, state);
cb(new_state, "new_state", il);
for (int64_t chunk = 0; chunk < n_chunks; chunk++) { for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
// shape: (S_k, chunk_size, 1, H_k * n_seqs) auto chunkify = [=](ggml_tensor * t) {
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (S_v, chunk_size, 1, H_v * n_seqs) auto chunkify_g = [=](ggml_tensor * t) {
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs) ggml_tensor * k_chunk = chunkify(k);
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul ggml_tensor * q_chunk = chunkify(q);
ggml_tensor * v_chunk = chunkify(v);
// shape: (chunk_size, 1, H_v * n_seqs) ggml_tensor * g_cs_chunk = chunkify_g(g_cumsum);
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat ggml_tensor * g_cs_chunk_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cs_chunk));
ggml_tensor * decay_mask_chunk = chunkify(decay_mask);
ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
ggml_tensor * gexp_chunk = ggml_exp(ctx0, g_cs_chunk_t);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
// replaced by precomputed attn_kq attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); attn = ggml_mul(ctx0, attn, decay_mask_chunk);
cb(attn_chunk, "attn_chunk", il); attn = ggml_mul(ctx0, attn, diag_mask);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
// v_new = v_i - v_prime // v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new_chunk", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter_chunk", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new // core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
cb(v_attn, "v_attn_chunk", il);
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
core_attn_out = core_attn_out == nullptr core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
? core_attn_out_chunk
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cs_chunk_t, g_cs_chunk_t->ne[0], 1, g_cs_chunk_t->ne[2], g_cs_chunk_t->ne[3],
g_cs_chunk_t->nb[1], g_cs_chunk_t->nb[2], g_cs_chunk_t->nb[3],
g_cs_chunk_t->nb[0] * (g_cs_chunk_t->ne[1] - 1)));
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cs_chunk, g_cs_chunk->ne[0], g_cs_chunk->ne[2], g_cs_chunk->ne[3]);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k_chunk,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
new_state = ggml_add(ctx0, new_state = ggml_add(ctx0,
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last, gexp_last->ne[0], gexp_last->ne[1], H_v, n_seqs)),
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
} }
// truncate padded tokens core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
S_v, n_tokens, H_v, n_seqs, ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0);
ggml_row_size(core_attn_out->type, S_v),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
output_tokens = ggml_cont(ctx0, output_tokens);
cb(output_tokens, "output_tokens", il); cb(output_tokens, "output_tokens", il);
// permute back to (S_v, H_v, n_tokens, n_seqs) // flatten output
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); ggml_tensor * flat_output =
output_tokens = ggml_cont(ctx0, output_tokens); ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
return {output_tokens, new_state}; ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
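A hedged summary of the chunked gated delta rule assembled above, written with the same symbol names (S is the recurrent state, A the solved intra-chunk attention, g_cum the cumulative gate; ggml's mul_mat transposition conventions are omitted):

$$
\begin{aligned}
v_{\mathrm{new}} &= v - k_{\mathrm{cumdecay}} \cdot S \\
o &= \big(q \odot e^{\,g_{\mathrm{cum}}}\big) \cdot S + A \cdot v_{\mathrm{new}} \\
S &\leftarrow e^{\,g_{\mathrm{last}}} \odot S + \big(k \odot e^{\,g_{\mathrm{last}} - g_{\mathrm{cum}}}\big)^{\top} \cdot v_{\mathrm{new}}
\end{aligned}
$$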
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive( ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -433,7 +419,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
cb(core_attn_out, "output_tokens", il); cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il); cb(state, "new_state", il);
return {core_attn_out, state}; // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * llm_build_qwen3next::build_norm_gated(
@@ -533,88 +523,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
return cur; return cur;
} }
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
ggml_tensor * input,
int il) {
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t n_seqs = ubatch.n_seqs;
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t num_k_heads = hparams.ssm_n_group;
const int64_t num_v_heads = hparams.ssm_dt_rank;
const int64_t head_v_dim = d_inner / num_v_heads;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
if (model.layers[il].wqkv) {
// optimized path
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
cb(z, "z", il);
return { qkv_mixed, z };
} else {
// legacy (slower) path
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
z = ggml_cont(ctx0, z);
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
return { qkv_mixed, z };
}
}
ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
llm_graph_input_rs * inp, llm_graph_input_rs * inp,
ggml_tensor * cur, ggml_tensor * cur,
@@ -639,13 +547,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
// Input projections // Input projections
auto qkvz = build_qkvz(cur, il); ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
ggml_tensor * qkv_mixed = qkvz.first; cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
ggml_tensor * z = qkvz.second;
ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il); cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads] // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
@@ -665,9 +575,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il); cb(a, "a", il);
ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); // Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
@@ -676,6 +585,48 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il); cb(gate, "gate", il);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * sizeof(float));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Get convolution states from cache // Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
@@ -686,6 +637,17 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il); cb(conv_states, "conv_states", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
// Calculate the total conv dimension
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
// Calculate convolution kernel size // Calculate convolution kernel size
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0]; const int64_t conv_kernel_size = conv_kernel->ne[0];
@@ -693,9 +655,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il); cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il); cb(conv_input, "conv_input", il);
@@ -718,25 +677,26 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il); cb(conv_output_proper, "conv_output_raw", il);
conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
cb(conv_output_proper, "conv_output_pre_silu", il);
ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
cb(conv_output_silu, "conv_output_silu", il); cb(conv_output_silu, "conv_output_silu", il);
ggml_tensor * conv_qkv_mix = conv_output_silu; ggml_tensor * conv_qkv_mix =
ggml_cont_2d(ctx0, ggml_transpose(ctx0, conv_output_silu), qkv_dim, n_seq_tokens * n_seqs);
// Calculate the total conv dimension cb(conv_qkv_mix, "conv_qkv_mix", il);
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output // Extract the convolved Q, K, V from conv_output
ggml_tensor * q_conv = ggml_tensor * q_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1], 0);
cb(q_conv, "q_conv", il); cb(q_conv, "q_conv", il);
ggml_tensor * k_conv = ggml_tensor * k_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il); cb(k_conv, "k_conv", il);
ggml_tensor * v_conv = ggml_tensor * v_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il); cb(v_conv, "v_conv", il);
@@ -745,6 +705,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
cb(state, "state_predelta", il); cb(state, "state_predelta", il);
@@ -776,29 +738,45 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(v_conv, "v_conv_predelta", il); cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state) ggml_tensor * attn_out;
if (n_seq_tokens == 1) { if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else { } else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
} }
ggml_tensor * output = attn_out.first; cb(attn_out, "attn_out", il);
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il); // The tensors were concatenated 1d, so we need to extract them 1d as well
cb(new_state, "new_state", il); const int64_t output_flat_size = head_v_dim * num_v_heads * n_seq_tokens * n_seqs;
ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
cb(attn_out_1d, "attn_out_1d", il);
ggml_tensor * attn_out_final = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
cb(attn_out_final, "attn_out_reshaped", il);
// Extract the state part (second part of the concatenated tensor)
// State starts after n_tokens elements along dimension 1
const int64_t state_flat_size = head_v_dim * head_v_dim * num_v_heads * n_seqs;
ggml_tensor * state_1d =
ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out));
cb(state_1d, "state_1d", il);
// Update the recurrent states // Update the recurrent states
ggml_build_forward_expand(gf, ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state, ggml_cpy(ctx0, state_1d,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
// Reshape both attn_out_final and z to 2D tensors for normalization // Reshape both attn_out_final and z to 2D tensors for normalization
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); ggml_tensor * attn_out_2d_final =
ggml_cont_2d(ctx0, attn_out_final, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); ggml_tensor * z_2d = ggml_cont_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z) // Apply gated normalization: self.norm(core_attn_out, z)
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
@@ -850,6 +828,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
shared_gate = ggml_sigmoid(ctx0, shared_gate); shared_gate = ggml_sigmoid(ctx0, shared_gate);
cb(shared_gate, "shared_expert_gate_sigmoid", il); cb(shared_gate, "shared_expert_gate_sigmoid", il);
// The gate needs to be broadcast to match the dimensions of ffn_shexp
// ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
// We need to repeat the gate along the feature dimension
shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
cb(shared_gate, "shared_expert_gate_broadcast", il);
// Apply the gate to the shared expert output // Apply the gate to the shared expert output
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
cb(ffn_shexp, "ffn_shexp_gated", il); cb(ffn_shexp, "ffn_shexp_gated", il);
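Aside (not part of the diff): the restored qwen3next path above carves the fused mixed_qkvz projection into q/k/v/z purely through views with byte offsets into each contiguous row, and later pulls the delta-net result apart the same way (flat output first, recurrent state after it). A minimal sketch of that offset arithmetic, independent of ggml; all names below are illustrative only.

    #include <cstddef>
    #include <cstdint>

    struct qkvz_offsets {
        size_t q, k, v, z; // byte offsets of each slice within one fused row
    };

    // Per k-head group the row holds [q | k | v | z]; the v and z slices each
    // carry num_v_heads / num_k_heads value-sized heads, matching split_sizes_qkvz.
    static qkvz_offsets fused_row_offsets(int64_t head_k_dim, int64_t head_v_dim,
                                          int64_t num_k_heads, int64_t num_v_heads,
                                          size_t elem_size) {
        const int64_t v_per_group = head_v_dim * num_v_heads / num_k_heads;
        qkvz_offsets o;
        o.q = 0;
        o.k = head_k_dim * elem_size;
        o.v = 2 * head_k_dim * elem_size;
        o.z = (2 * head_k_dim + v_per_group) * elem_size;
        return o;
    }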

View File

@@ -2,8 +2,7 @@
llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -17,6 +16,17 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -110,9 +120,8 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (il < (int) n_deepstack_layers) { if (ubatch.embd && (size_t)il < n_deepstack_layers) {
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float)); cur = ggml_add(ctx0, cur, deepstack_features[il]);
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
} }

View File

@@ -2,8 +2,7 @@
llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -17,6 +16,17 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -103,9 +113,8 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (il < (int) n_deepstack_layers) { if (ubatch.embd && (size_t)il < n_deepstack_layers) {
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float)); cur = ggml_add(ctx0, cur, deepstack_features[il]);
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
} }
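Aside (not part of the diff): both sides of the qwen3vl hunks above read deepstack slice i at byte offset (i + 1) * n_embd * sizeof(float) from the packed per-token image embedding, i.e. the row is assumed to be laid out as [main | deepstack_0 | deepstack_1 | ...] along the feature axis, and slice i is added to the residual stream at layer i. A tiny sketch of that layout with illustrative names and plain floats.

    #include <cstdint>
    #include <vector>

    // row: one token's packed embedding of n_embd * (1 + n_deepstack_layers) floats
    static const float * deepstack_slice(const std::vector<float> & row,
                                         int64_t n_embd, size_t i) {
        return row.data() + (i + 1) * n_embd; // slice i feeds layer i's output
    }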

View File

@@ -26,16 +26,10 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
ggml_tensor * probs = nullptr;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
il % hparams.n_no_rope_layer_step != 0;
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
cb(probs, "ffn_moe_logits", il); cb(probs, "ffn_moe_logits", il);
// norm // norm
@@ -58,11 +52,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
if (use_rope) { if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
} }
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
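Aside (not part of the diff): the restored smallthinker condition applies RoPE on every layer except multiples of n_no_rope_layer_step, and on all layers when the step equals n_layer. A quick illustration with assumed example values, not taken from any model.

    #include <cstdio>

    int main() {
        const int n_layer = 8, n_no_rope_layer_step = 4; // assumed example values
        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = n_no_rope_layer_step == n_layer ||
                                  il % n_no_rope_layer_step != 0;
            std::printf("layer %d: rope=%d\n", il, use_rope); // layers 0 and 4 print 0
        }
        return 0;
    }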

View File

@@ -985,11 +985,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION }, { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK }, { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL }, { "\\p{S}", unicode_cpt_flags::SYMBOL },
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
}; };
static const std::map<int, int> k_ucat_cpt = { static const std::map<int, int> k_ucat_cpt = {
@@ -1100,26 +1095,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue; continue;
} }
// Match \p{...} Unicode properties of varying lengths if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
regex_expr[i + 1] == 'p' && regex_expr[i + 1] == 'p' &&
regex_expr[i + 2] == '{') { regex_expr[i + 2] == '{' &&
// Find the closing brace regex_expr[i + 4] == '}') {
size_t closing_brace = regex_expr.find('}', i + 3); const std::string pat = regex_expr.substr(i, 5);
if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
const std::string pat = regex_expr.substr(i, closing_brace - i + 1); if (!inside) {
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { regex_expr_collapsed += '[';
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i = closing_brace;
continue;
} }
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i += 4;
continue;
} }
} }
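Aside (not part of the diff): the restored tokenizer path only recognizes single-letter categories written as exactly \p{X} (five characters) and collapses them into a bracket expression, whereas the reverted change had generalized the match to longer names such as \p{Lu}. A simplified sketch of the fixed-width match, with placeholder maps standing in for the real k_ucat_cpt / k_ucat_map tables.

    #include <map>
    #include <string>

    // Returns true and appends a collapsed class when re[i..] is exactly "\p{X}".
    static bool collapse_ucat(const std::string & re, size_t i, std::string & out) {
        static const std::map<std::string, char> k_ucat = {
            { "\\p{N}", '0' }, { "\\p{L}", 'A' },   // placeholder markers only
        };
        if (i + 4 < re.size() && re[i] == '\\' && re[i + 1] == 'p' &&
            re[i + 2] == '{' && re[i + 4] == '}') {
            auto it = k_ucat.find(re.substr(i, 5));
            if (it != k_ucat.end()) {
                out += '[';
                out += it->second;
                out += ']';
                return true; // caller then advances i by 4 and continues
            }
        }
        return false;
    }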

View File

@@ -32,6 +32,10 @@ struct clip_graph {
const float kq_scale; const float kq_scale;
const clip_flash_attn_type flash_attn_type; const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr; ggml_context_ptr ctx0_ptr;
ggml_context * ctx0; ggml_context * ctx0;
ggml_cgraph * gf; ggml_cgraph * gf;

View File

@@ -45,14 +45,13 @@
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" #define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific // audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -139,62 +138,6 @@
#define TN_TOK_BOI "v.boi" #define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi" #define TN_TOK_EOI "v.eoi"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
// Stage 0 Block (Edge Residual)
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
// Stage 1+ Block (Universal Inverted Residual)
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
// Attention Components
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
// MSFA
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
// align x to upper multiple of n // align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
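Worked instance of the alignment macro above, which rounds x up to the next multiple of n (illustrative values, not from the diff):

    static_assert(((250 + 32 - 1) / 32) * 32 == 256, "CLIP_ALIGN(250, 32) rounds up to 256");
    static_assert(((256 + 32 - 1) / 32) * 32 == 256, "already-aligned values are unchanged");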
@@ -212,8 +155,6 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL, PROJECTOR_TYPE_QWEN25VL,
@@ -224,15 +165,12 @@ enum projector_type {
PROJECTOR_TYPE_GLMA, PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@@ -246,8 +184,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
@@ -257,15 +193,12 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLMA, "glma"}, { PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"}, { PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"}, { PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@@ -4,7 +4,6 @@
#include "clip.h" #include "clip.h"
#include "clip-impl.h" #include "clip-impl.h"
#include <array>
#include <vector> #include <vector>
#include <unordered_set> #include <unordered_set>
#include <cstdint> #include <cstdint>
@@ -61,7 +60,6 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer; std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0; int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0; int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio // audio
int32_t n_mel_bins = 0; // whisper preprocessor int32_t n_mel_bins = 0; // whisper preprocessor
@@ -144,74 +142,11 @@ struct clip_layer {
ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr; ggml_tensor * deepstack_fc2_b = nullptr;
// lfm2
ggml_tensor * ff_norm_w = nullptr;
ggml_tensor * ff_norm_b = nullptr;
ggml_tensor * ff_norm_1_w = nullptr;
ggml_tensor * ff_norm_1_b = nullptr;
ggml_tensor * ff_up_1_w = nullptr;
ggml_tensor * ff_up_1_b = nullptr;
ggml_tensor * ff_down_1_w = nullptr;
ggml_tensor * ff_down_1_b = nullptr;
ggml_tensor * pos_bias_u = nullptr;
ggml_tensor * pos_bias_v = nullptr;
ggml_tensor * norm_conv_w = nullptr;
ggml_tensor * norm_conv_b = nullptr;
ggml_tensor * linear_pos_w = nullptr;
ggml_tensor * conv_norm_w = nullptr;
ggml_tensor * conv_norm_b = nullptr;
ggml_tensor * conv_dw_w = nullptr;
ggml_tensor * conv_dw_b = nullptr;
ggml_tensor * conv_pw1_w = nullptr;
ggml_tensor * conv_pw1_b = nullptr;
ggml_tensor * conv_pw2_w = nullptr;
ggml_tensor * conv_pw2_b = nullptr;
bool has_deepstack() const { bool has_deepstack() const {
return deepstack_fc1_w != nullptr; return deepstack_fc1_w != nullptr;
} }
}; };
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
struct mobilenetv5_block {
// Stage 0 (Edge Residual)
ggml_tensor * s0_conv_exp_w = nullptr;
ggml_tensor * s0_bn1_w = nullptr;
ggml_tensor * s0_conv_pwl_w = nullptr;
ggml_tensor * s0_bn2_w = nullptr;
// Stage 1+ (Universal Inverted Residual)
ggml_tensor * dw_start_w = nullptr;
ggml_tensor * dw_start_bn_w = nullptr;
ggml_tensor * pw_exp_w = nullptr;
ggml_tensor * pw_exp_bn_w = nullptr;
ggml_tensor * dw_mid_w = nullptr;
ggml_tensor * dw_mid_bn_w = nullptr;
ggml_tensor * pw_proj_w = nullptr;
ggml_tensor * pw_proj_bn_w = nullptr;
ggml_tensor * layer_scale_w = nullptr;
// Attention (MQA) components
ggml_tensor * attn_q_w = nullptr;
ggml_tensor * attn_k_w = nullptr;
ggml_tensor * attn_v_w = nullptr;
ggml_tensor * attn_o_w = nullptr;
// Optional downsampling/norm in attention
ggml_tensor * attn_k_dw_w = nullptr;
ggml_tensor * attn_k_norm_w = nullptr;
ggml_tensor * attn_v_dw_w = nullptr;
ggml_tensor * attn_v_norm_w = nullptr;
// Block norm (often present in attention blocks)
ggml_tensor * attn_norm_w = nullptr;
};
struct clip_model { struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION; clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +263,6 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr;
// mobilenetv5 for gemma3n
std::vector<mobilenetv5_block> mobilenet_blocks;
std::vector<int> mobilenet_stage_ends;
ggml_tensor * mobilenet_stem_conv_w = nullptr;
ggml_tensor * mobilenet_stem_conv_b = nullptr;
ggml_tensor * mobilenet_stem_norm_w = nullptr;
ggml_tensor * mm_post_proj_norm_w = nullptr;
// Multi-Scale Fusion Adapter (MSFA) components
ggml_tensor * msfa_concat_conv_w = nullptr;
ggml_tensor * msfa_concat_norm_w = nullptr;
ggml_tensor * msfa_ffn_expand_w = nullptr;
ggml_tensor * msfa_ffn_project_w = nullptr;
ggml_tensor * msfa_ffn_expand_bn = nullptr;
ggml_tensor * msfa_ffn_project_bn = nullptr;
// pixtral, glm4v // pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr; ggml_tensor * mm_patch_merger_w = nullptr;
@@ -368,16 +286,9 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr; ggml_tensor * mm_eoi = nullptr;
// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
ggml_tensor * pre_encode_out_w = nullptr;
ggml_tensor * pre_encode_out_b = nullptr;
bool audio_has_avgpool() const { bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL || proj_type == PROJECTOR_TYPE_VOXTRAL;
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
} }
bool audio_has_stack_frames() const { bool audio_has_stack_frames() const {

View File

@@ -165,14 +165,18 @@ struct clip_ctx {
ggml_backend_t backend_cpu = nullptr; ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_ptr buf; ggml_backend_buffer_ptr buf;
int max_nodes = 8192; int max_nodes = 8192;
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false; bool is_allocated = false;
// for debugging
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;
clip_ctx(clip_context_params & ctx_params) { clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type; flash_attn_type = ctx_params.flash_attn_type;
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) { if (!backend_cpu) {
throw std::runtime_error("failed to initialize CPU backend"); throw std::runtime_error("failed to initialize CPU backend");
@@ -213,10 +217,6 @@ struct clip_ctx {
sched.reset( sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
); );
if (ctx_params.cb_eval != nullptr) {
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
}
} }
~clip_ctx() { ~clip_ctx() {
@@ -252,7 +252,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
n_mmproj_embd(clip_n_mmproj_embd(ctx)), n_mmproj_embd(clip_n_mmproj_embd(ctx)),
eps(hparams.eps), eps(hparams.eps),
kq_scale(1.0f / sqrtf((float)d_head)), kq_scale(1.0f / sqrtf((float)d_head)),
flash_attn_type(ctx->flash_attn_type) { flash_attn_type(ctx->flash_attn_type),
debug_graph(ctx->debug_graph),
debug_print_tensors(ctx->debug_print_tensors) {
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(), /*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(), /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -263,11 +265,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
} }
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
if (il >= 0) { if (debug_graph) {
ggml_format_name(cur, "%s-%d", name, il); ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
} else { std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
ggml_set_name(cur, name); ggml_set_name(cur, cur_name.c_str());
ggml_set_output(cur);
ggml_build_forward_expand(gf, cur);
debug_print_tensors.push_back(cur);
} }
} }
@@ -796,10 +801,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_siglip>(ctx, img); builder = std::make_unique<clip_graph_siglip>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR: case PROJECTOR_TYPE_LIGHTONOCR:
{ {
@@ -830,7 +831,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img); builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
} break; } break;
@@ -850,18 +850,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_llava>(ctx, img); builder = std::make_unique<clip_graph_llava>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
builder = std::make_unique<clip_graph_glm4v>(ctx, img); builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
default: default:
GGML_ABORT("missing cgraph builder"); GGML_ABORT("missing cgraph builder");
} }
@@ -1162,14 +1154,6 @@ struct clip_model_loader {
// test model (tinygemma3) has a different value, we optionally read it // test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
// Similar configuration to Gemma3
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
@@ -1187,20 +1171,6 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
} }
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
for (auto & layer : wa_layer_indexes_vec) {
hparams.wa_layer_indexes.insert(layer);
}
// support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
hparams.rope_theta = 10000.0f; hparams.rope_theta = 10000.0f;
@@ -1219,7 +1189,6 @@ struct clip_model_loader {
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL || model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1235,15 +1204,6 @@ struct clip_model_loader {
hparams.audio_window_len = 400; hparams.audio_window_len = 400;
hparams.audio_hop_len = 160; hparams.audio_hop_len = 160;
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
hparams.audio_chunk_len = 1; // in seconds
hparams.audio_sample_rate = 16000;
hparams.audio_n_fft = 512;
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
default: default:
break; break;
} }
@@ -1269,14 +1229,7 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (!hparams.wa_layer_indexes.empty()) {
LOG_INF("%s: wa_layer_indexes: ", __func__);
for (auto & layer : hparams.wa_layer_indexes) {
LOG_INF("%d ", layer);
}
LOG_INF("\n");
}
if (hparams.image_min_pixels > 0) { if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
} }
@@ -1358,10 +1311,6 @@ struct clip_model_loader {
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
hparams.n_layer = 0; // gemma3n does not use normal layer structure
}
// layers // layers
model.layers.resize(hparams.n_layer); model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1436,7 +1385,6 @@ struct clip_model_loader {
} }
} }
switch (model.proj_type) { switch (model.proj_type) {
case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_MLP_NORM:
@@ -1531,8 +1479,8 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
} break; } break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
@@ -1549,14 +1497,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1576,112 +1516,11 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
// Dynamically load blocks stage by stage
for (int stage = 0; stage < 4; ++stage) {
int blocks_found_in_stage = 0;
for (int blk_idx = 0; ; ++blk_idx) {
bool found_block = false;
mobilenetv5_block block;
// 1. Check for Edge Residual (S0)
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
if (block.s0_conv_exp_w) {
found_block = true;
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
}
// 2. Check for UIR (Universal Inverted Residual)
else {
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
if (block.dw_start_w || block.pw_exp_w) {
found_block = true;
if (block.dw_start_w) {
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
}
if (block.pw_exp_w) {
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
}
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
if (block.dw_mid_w) {
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
}
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
if (block.pw_proj_w) {
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
}
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
// 3. Check for Attention (MQA)
// Even if UIR/Edge check failed, this might be a pure attention block
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
if (attn_q_check) {
found_block = true;
block.attn_q_w = attn_q_check;
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
if (!block.layer_scale_w) {
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
if (found_block) {
model.mobilenet_blocks.push_back(block);
blocks_found_in_stage++;
} else {
// End of blocks for this stage
break;
}
}
// Track where this stage ends in the flat vector
if (blocks_found_in_stage > 0) {
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
}
}
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
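Aside (not part of the diff): the removed MobileNetV5 loader above discovers blocks by probing optional tensors per (stage, block) index, ends a stage at the first missing block, and records where each stage finishes in the flat block vector. A stripped-down sketch of that probing loop; find_tensor is a hypothetical stand-in for get_tensor(name, /*required=*/false).

    #include <functional>
    #include <string>
    #include <vector>

    static std::vector<int> probe_stage_ends(
            const std::function<bool(const std::string &)> & find_tensor) {
        std::vector<int> stage_ends;
        int n_blocks = 0;
        for (int stage = 0; stage < 4; ++stage) {
            int found_in_stage = 0;
            for (int blk = 0; ; ++blk) {
                const std::string base = "v.blk." + std::to_string(stage) + "." + std::to_string(blk);
                if (!find_tensor(base)) {
                    break; // first missing block ends the stage
                }
                ++n_blocks;
                ++found_in_stage;
            }
            if (found_in_stage > 0) {
                stage_ends.push_back(n_blocks - 1); // flat index of the stage's last block
            }
        }
        return stage_ends;
    }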
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
} break; } break;
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_KIMIVL: case PROJECTOR_TYPE_KIMIVL:
{ {
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1741,17 +1580,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
} break; } break;
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
} break;
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
{ {
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1773,8 +1601,8 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
} break; } break;
case PROJECTOR_TYPE_LLAMA4: case PROJECTOR_TYPE_LLAMA4:
{ {
@@ -1800,52 +1628,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
}
model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = model.layers[il];
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
}
} break;
default: default:
GGML_ASSERT(false && "unknown projector type"); GGML_ASSERT(false && "unknown projector type");
} }
@@ -2150,7 +1932,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
try { try {
clip_model_loader loader(fname); clip_model_loader loader(fname);
bool skip_audio = false;
if (loader.has_vision) { if (loader.has_vision) {
ctx_vision = new clip_ctx(ctx_params); ctx_vision = new clip_ctx(ctx_params);
@@ -2160,14 +1941,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
loader.warmup(*ctx_vision); loader.warmup(*ctx_vision);
} }
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
} }
if (loader.has_audio && !skip_audio) { if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params); ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio); loader.load_tensors(*ctx_audio);
@@ -2290,7 +2067,7 @@ struct img_tool {
std::array<uint8_t, 3> pad_color = {0, 0, 0}) { std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width; dst.nx = target_resolution.width;
dst.ny = target_resolution.height; dst.ny = target_resolution.height;
dst.buf.resize(3 * static_cast<size_t>(dst.nx) * static_cast<size_t>(dst.ny)); dst.buf.resize(3 * dst.nx * dst.ny);
if (dst.nx == src.nx && dst.ny == src.ny) { if (dst.nx == src.nx && dst.ny == src.ny) {
// no resize needed, simple copy // no resize needed, simple copy
@@ -2343,7 +2120,7 @@ struct img_tool {
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w; dst.nx = w;
dst.ny = h; dst.ny = h;
dst.buf.resize(3 * static_cast<size_t>(w) * static_cast<size_t>(h)); dst.buf.resize(3 * w * h);
for (int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
@@ -2440,7 +2217,7 @@ private:
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
dst.nx = target_width; dst.nx = target_width;
dst.ny = target_height; dst.ny = target_height;
dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height)); dst.buf.resize(3 * target_width * target_height);
float x_ratio = static_cast<float>(src.nx - 1) / target_width; float x_ratio = static_cast<float>(src.nx - 1) / target_width;
float y_ratio = static_cast<float>(src.ny - 1) / target_height; float y_ratio = static_cast<float>(src.ny - 1) / target_height;
@@ -2479,7 +2256,7 @@ private:
dst.nx = target_width; dst.nx = target_width;
dst.ny = target_height; dst.ny = target_height;
dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height)); dst.buf.resize(3 * target_width * target_height);
float Cc; float Cc;
float C[5] = {}; float C[5] = {};
@@ -2891,57 +2668,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res; // res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
const int patch_size = params.patch_size; // typically 16
const int merge_size = params.n_merge; // typically 2
const int align_size = patch_size * merge_size; // 32
const int max_num_patches = params.image_max_pixels > 0 ?
params.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
float scale = 1.0f;
int target_height = original_size.height;
int target_width = original_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
// Round up to nearest multiple of align_size
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
// Ensure at least one patch
return std::max(align_size, aligned);
};
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, original_size.height);
target_width = get_scaled_image_size(scale, original_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
int num_patches = num_patches_h * num_patches_w;
if (num_patches > max_num_patches) {
scale -= 0.02f;
} else {
break;
}
}
clip_image_size new_size = {target_width, target_height};
// Resize the image
clip_image_u8 resized;
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// Normalize to float32
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// Add to results
res_imgs->entries.push_back(std::move(img_f32));
} break;
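Aside (not part of the diff): the removed YoutuVL preprocessing keeps each side aligned to patch_size * merge_size and walks the scale down in 0.02 steps until the patch count fits the budget. A standalone sketch of that search with assumed parameter names.

    #include <algorithm>
    #include <cmath>

    struct wh { int width, height; };

    static wh fit_to_patch_budget(wh size, int patch_size, int merge_size, int max_num_patches) {
        const int align = patch_size * merge_size;
        auto scaled = [align](float scale, int v) {
            int a = static_cast<int>(std::ceil(v * scale / align)) * align;
            return std::max(align, a); // keep at least one merged patch per side
        };
        float scale = 1.0f;
        wh out = size;
        while (scale > 0.0f) {
            out = { scaled(scale, size.width), scaled(scale, size.height) };
            const int n_patches = (out.width / patch_size) * (out.height / patch_size);
            if (n_patches <= max_num_patches) {
                break;
            }
            scale -= 0.02f; // same step as the removed code
        }
        return out;
    }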
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
{ {
@@ -3005,16 +2731,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
clip_image_u8 resized_image;
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
{ {
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384 // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3184,7 +2900,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2; return (img->nx / params.patch_size) / 2;
default: default:
break; break;
@@ -3200,7 +2915,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2; return (img->ny / params.patch_size) / 2;
default: default:
break; break;
@@ -3261,7 +2975,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// dynamic size (2 conv, so double patch size) // dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2); int x_patch = img->nx / (params.patch_size * 2);
@@ -3277,12 +2990,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int scale_factor = ctx->model.hparams.n_merge; int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor); n_patches /= (scale_factor * scale_factor);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
// regardless of input size (see architecture description)
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
} break;
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL: case PROJECTOR_TYPE_KIMIVL:
{ {
@@ -3308,7 +3015,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
n_patches = img->nx; n_patches = img->nx;
@@ -3341,10 +3047,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{ {
n_patches += 2; // for BOI and EOI token embeddings n_patches += 2; // for BOI and EOI token embeddings
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
} break;
default: default:
GGML_ABORT("unsupported projector type"); GGML_ABORT("unsupported projector type");
} }
@@ -3377,6 +3079,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
// build the inference graph // build the inference graph
ctx->debug_print_tensors.clear();
ggml_backend_sched_reset(ctx->sched.get()); ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3394,6 +3097,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size; const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size; const int pos_h = image_size_height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) { auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name); ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3542,11 +3246,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions); set_input_i32("positions", positions);
} break; } break;
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// pw * ph = number of tokens output by the ViT after applying the patch merger // pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * iph = number of vision tokens processed inside the ViT // ipw * iph = number of vision tokens processed inside the ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2; const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio; const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio;
@@ -3557,7 +3259,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw); std::vector<int> inv_idx(ph * pw);
if (use_window_attn) { if (use_window_attn) {
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112; const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio; const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0; int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3674,7 +3376,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("patches", patches); set_input_i32("patches", patches);
} break; } break;
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
@@ -3682,7 +3383,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
{ {
@@ -3705,27 +3405,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
set_input_i32("pos_w", pos_data); set_input_i32("pos_w", pos_data);
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
GGML_ASSERT(imgs.entries.size() == 1);
const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
auto d_model = 512;
auto seq_len = n_frames * 2 - 1;
std::vector<float> pos_emb(d_model*seq_len);
std::vector<double> inv_freq(d_model / 2);
for (size_t i = 0; i < inv_freq.size(); ++i) {
inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
}
for (int64_t pos = 0; pos < seq_len; ++pos) {
for (size_t i = 0; i < inv_freq.size(); ++i) {
const float ang = (n_frames - pos - 1) * inv_freq[i];
pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
}
}
set_input_f32("pos_emb", pos_emb);
} break;
default: default:
GGML_ABORT("Unknown projector type"); GGML_ABORT("Unknown projector type");
} }
@@ -3746,6 +3425,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return false; return false;
} }
// print debug nodes
if (ctx->debug_graph) {
LOG_INF("\n\n---\n\n");
LOG_INF("\n\nDebug graph:\n\n");
for (ggml_tensor * t : ctx->debug_print_tensors) {
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
print_tensor_shape(t);
print_tensor_data(t, data.data(), 3);
}
}
// the last node is the embedding tensor // the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1); ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -3784,19 +3475,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0]; return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths // main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0]; return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.projection->ne[1]; return ctx->model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1]; return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
return ctx->model.mm_3_w->ne[1]; return ctx->model.mm_3_w->ne[1];
@@ -3811,8 +3499,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1]; return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1]; return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1]; return ctx->model.mm_ffn_down_w->ne[1];
default: default:
@@ -3821,7 +3507,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
} }
int clip_is_minicpmv(const struct clip_ctx * ctx) { int clip_is_minicpmv(const struct clip_ctx * ctx) {
// TODO: remove this function
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
return ctx->model.hparams.minicpmv_version; return ctx->model.hparams.minicpmv_version;
} }
@@ -3829,14 +3514,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
} }
bool clip_is_glm(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) {
// TODO: remove this function
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
} }
bool clip_is_mrope(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}
bool clip_is_llava(const struct clip_ctx * ctx) { bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->model.hparams.has_llava_projector; return ctx->model.hparams.has_llava_projector;
} }
bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) { bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION; return ctx->model.modality == CLIP_MODALITY_VISION;
} }
@@ -3846,16 +3541,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
} }
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) { return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
case PROJECTOR_TYPE_ULTRAVOX: || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
case PROJECTOR_TYPE_QWEN2A: || ctx->proj_type() == PROJECTOR_TYPE_GLMA
case PROJECTOR_TYPE_GLMA: || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return true;
default:
return false;
}
} }
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3897,6 +3586,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
// //
// API for debugging // API for debugging
// //
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img; clip_image_f32 img;
img.nx = w; img.nx = w;
@@ -3905,6 +3595,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
for (int i = 0; i < h * w * 3; i++) { for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value); img.buf[i] = static_cast<float>(fill_value);
} }
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr); clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here"); GGML_ASSERT(img.buf.empty() && "expected, always stop here");
} }

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
#include "mtmd.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
@@ -38,8 +37,6 @@ struct clip_context_params {
int image_min_tokens; int image_min_tokens;
int image_max_tokens; int image_max_tokens;
bool warmup; bool warmup;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
}; };
struct clip_init_result { struct clip_init_result {
@@ -107,9 +104,9 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx); int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated bool clip_is_gemma3(const struct clip_ctx * ctx);
// do NOT add new functions like this
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@@ -1,216 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_conformer::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
ggml_set_name(pos_emb, "pos_emb");
ggml_set_input(pos_emb);
ggml_build_forward_expand(gf, pos_emb);
ggml_tensor * inp = build_inp_raw(1);
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
// pre encode, conv subsampling
{
// layer.0 - conv2d
cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
cb(cur, "conformer.pre_encode.conv.{}", 0);
// layer.1 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.2 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
cb(cur, "conformer.pre_encode.conv.{}", 2);
// layer.3 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
cb(cur, "conformer.pre_encode.conv.{}", 3);
// layer.4 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.5 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
cb(cur, "conformer.pre_encode.conv.{}", 5);
// layer.6 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
cb(cur, "conformer.pre_encode.conv.{}", 6);
// layer.7 - relu
cur = ggml_relu_inplace(ctx0, cur);
// flatten channel and frequency axis
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
// calculate out
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
cb(cur, "conformer.pre_encode.out", -1);
}
// pos_emb
cb(pos_emb, "pos_emb", -1);
for (int il = 0; il < hparams.n_layer; il++) {
const auto & layer = model.layers[il];
auto * residual = cur;
cb(cur, "layer.in", il);
// feed_forward1
cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
il);
cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
const auto fc_factor = 0.5f;
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
// self-attention
{
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_self_att", il);
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
// build_attn won't fit due to matrix_ac and matrix_bd separation
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
// rel shift
{
const auto pos_len = matrix_bd->ne[0];
const auto q_len = matrix_bd->ne[1];
const auto h = matrix_bd->ne[2];
matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
}
matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
cb(scores, "conformer.layers.{}.self_attn.id0", il);
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
out = ggml_add(ctx0, out, layer.o_b);
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
cur = out;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_conv", il);
// conv
{
auto * x = cur;
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
x = ggml_add(ctx0, x, layer.conv_pw1_b);
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
// ggml_glu doesn't support sigmoid
// TODO @ngxson : support this op in ggml
{
int64_t d = x->ne[0] / 2;
ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
}
// use ggml_ssm_conv for f32 precision
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
x = ggml_add(ctx0, x, layer.conv_dw_b);
x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
x = ggml_silu(ctx0, x);
// pointwise_conv2
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
x = ggml_add(ctx0, x, layer.conv_pw2_b);
cur = x;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
cb(residual, "conformer.layers.{}.conv.id", il);
cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_out", il);
}
// audio adapter
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cb(cur, "audio_adapter.model.{}", 0);
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
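For readers tracing the deleted conformer graph: the pad/roll/reshape/view sequence in the "rel shift" block above is the Transformer-XL relative-shift trick. A plain-C++ sketch of the equivalent indexing, assuming pos_len == 2*q_len - 1 as in this graph (illustrative only, not ggml):

#include <cassert>
#include <vector>

// out[q][k] = x[q][k - q + q_len - 1], i.e. each row is re-centred so that
// column k holds the value for relative distance (k - q).
static std::vector<std::vector<float>> rel_shift(const std::vector<std::vector<float>> & x) {
    const int q_len   = (int) x.size();
    const int pos_len = (int) x[0].size();
    assert(pos_len == 2 * q_len - 1); // assumed shape of the relative position axis
    std::vector<std::vector<float>> out(q_len, std::vector<float>(q_len, 0.0f));
    for (int q = 0; q < q_len; ++q) {
        for (int k = 0; k < q_len; ++k) {
            out[q][k] = x[q][k - q + q_len - 1];
        }
    }
    return out;
}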

View File

@@ -1,451 +0,0 @@
#include "models.h"
// Helpers for MobileNetV5 Blocks
// RMS Norm 2D - normalizes over channels for each spatial position
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
// inp: [W, H, C, B]
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
cur = ggml_rms_norm(ctx0, cur, eps);
if (weight) {
cur = ggml_mul(ctx0, cur, weight);
}
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
return cur;
}
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
const int64_t ih = inp->ne[1]; // height
const int64_t iw = inp->ne[0]; // width
// Calculate output size (ceil division)
const int64_t oh = (ih + stride_h - 1) / stride_h;
const int64_t ow = (iw + stride_w - 1) / stride_w;
// Calculate padding needed
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
// Split padding asymmetrically
const int pad_h_top = pad_h / 2;
const int pad_h_bottom = pad_h - pad_h_top;
const int pad_w_left = pad_w / 2;
const int pad_w_right = pad_w - pad_w_left;
// Apply padding if needed
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
if (pad_h > 0 || pad_w > 0) {
inp = ggml_pad_ext(ctx0, inp,
pad_w_left, pad_w_right, // width padding (dim 0)
pad_h_top, pad_h_bottom, // height padding (dim 1)
0, 0, // no channel padding (dim 2)
0, 0); // no batch padding (dim 3)
}
return inp;
}
// Edge Residual Block (Stage 0)
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Expansion Conv (3x3)
if (stride == 2) {
// Case: Downsampling (Block 0)
// Replicates Conv2dSame(kernel=3, stride=2)
cur = pad_same_2d(cur, 3, 3, stride, stride);
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
} else {
// Case: Normal 3x3 Block (Block 1, 2)
// Replicates Conv2d(kernel=3, stride=1, padding=1)
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
}
// BN + Activation
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
cur = ggml_gelu(ctx0, cur);
// 2. Pointwise Linear Conv (1x1)
// 1x1 Convs usually have padding=0 and stride=1
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
// 3. Residual Connection
// Only apply residual if spatial dimensions and channels match (stride 1)
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Universal Inverted Residual Block (Stage 1+)
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Depthwise Start (Optional)
// NOTE: dw_start always has stride=1 (no downsampling here)
if (block.dw_start_w) {
int k = block.dw_start_w->ne[0]; // 3 or 5
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
}
// 2. Pointwise Expansion (1x1)
if (block.pw_exp_w) {
// Standard 1x1 conv, pad=0, stride=1
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 3. Depthwise Mid (Optional)
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
if (block.dw_mid_w) {
int k = block.dw_mid_w->ne[0]; // 3 or 5
if (stride > 1) {
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
cur = pad_same_2d(cur, k, k, stride, stride);
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
} else {
// Case: Stride 1 -> Use Standard Symmetric Padding
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
}
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 4. Pointwise Projection (1x1)
if (block.pw_proj_w) {
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
}
// Apply Layer Scaling if present
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
// 5. Residual Connection
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
bool same_channel = (inp->ne[2] == cur->ne[2]);
if (same_spatial && same_channel) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Attention Block (MQA)
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
ggml_tensor * cur = inp;
// Norm
if (block.attn_norm_w) {
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
}
// 1. Q Calculation
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
// 2. K Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * k_inp = cur;
if (block.attn_k_dw_w) {
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_k_norm_w) {
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
}
}
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
// 3. V Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * v_inp = cur;
if (block.attn_v_dw_w) {
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_v_norm_w) {
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
}
}
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
const int D = k->ne[2]; // Head dimension
const int n_head = q->ne[2] / D;
const int N = W * H;
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
q = ggml_cont(ctx0, q);
const int Wk = k->ne[0]; const int Hk = k->ne[1];
const int M = Wk * Hk;
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
k = ggml_reshape_3d(ctx0, k, M, D, B);
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
k = ggml_cont(ctx0, k);
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
v = ggml_reshape_3d(ctx0, v, M, D, B);
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
v = ggml_cont(ctx0, v); // [M, D, 1, B]
// Multi-Query Attention
float scale = 1.0f / sqrtf((float)D);
// Step 1: Compute Q @ K.T
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
scores = ggml_scale(ctx0, scores, scale);
scores = ggml_soft_max(ctx0, scores);
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
kqv = ggml_cont(ctx0, kqv);
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
kqv = ggml_cont(ctx0, kqv);
// Output projection
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
// Residual & Layer Scale
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
ggml_cgraph * clip_graph_mobilenetv5::build() {
ggml_tensor * inp = build_inp_raw();
// 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
if (model.mobilenet_stem_conv_b) {
cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
}
if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
cur = ggml_gelu(ctx0, cur);
// 2. Blocks
std::vector<ggml_tensor*> intermediate_features;
const int total_blocks = model.mobilenet_blocks.size();
auto is_stage_start = [&](int i) {
if (i == 0) return true;
for (int end_idx : model.mobilenet_stage_ends) {
if (i == end_idx + 1) return true;
}
return false;
};
auto is_fusion_point = [&](int i) {
if (model.mobilenet_stage_ends.size() >= 4) {
if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
} else {
if (i == total_blocks - 1) return true;
}
return false;
};
for (int i = 0; i < total_blocks; i++) {
const auto & block = model.mobilenet_blocks[i];
int stride = is_stage_start(i) ? 2 : 1;
if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
else cur = build_inverted_residual(cur, block, stride);
if (is_fusion_point(i)) {
intermediate_features.push_back(cur);
}
}
// 3. Multi-Scale Fusion Adapter (MSFA)
if (!intermediate_features.empty()) {
// A. Reference Resolution: PyTorch implementation uses inputs[0]
// We assume intermediate_features[0] is the "High Resolution" target.
// In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
ggml_tensor* target_feat = intermediate_features[0];
int high_res_w = target_feat->ne[0];
int high_res_h = target_feat->ne[1];
std::vector<ggml_tensor*> resized_feats;
// B. Resize inputs to match inputs[0] (High Resolution)
for (auto feat : intermediate_features) {
int feat_w = feat->ne[0];
int feat_h = feat->ne[1];
// PyTorch: if feat_size < high_resolution: interpolate
if (feat_w < high_res_w || feat_h < high_res_h) {
// Calculate scale factor.
// Note: PyTorch 'nearest' works on arbitrary float scales.
// ggml_upscale generally takes integer factors or target sizes depending on helper.
// Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
int scale_w = high_res_w / feat_w;
// int scale_h = high_res_h / feat_h;
// Safety check for non-integer scaling if strictly replicating
GGML_ASSERT(high_res_w % feat_w == 0);
// Upsample (Nearest Neighbor)
// scale_w is the integer upscale factor (typically 2)
feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
}
resized_feats.push_back(feat);
}
// C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
cur = resized_feats[0];
for (size_t k = 1; k < resized_feats.size(); ++k) {
cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
}
// D. FFN (UniversalInvertedResidual)
// Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
// 1. Expansion
if (model.msfa_ffn_expand_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
if (model.msfa_ffn_expand_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
}
cur = ggml_gelu(ctx0, cur);
}
// 2. Projection (No DW because kernel_size=0)
if (model.msfa_ffn_project_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
// UniversalInvertedResidual typically has a norm after projection
if (model.msfa_ffn_project_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
}
}
// E. Final Downsample to Target Resolution (Output Resolution)
// PyTorch: matches self.output_resolution (e.g. 16x16)
const int target_out_res = 16;
int current_w = cur->ne[0];
if (current_w > target_out_res) {
int s = current_w / target_out_res;
GGML_ASSERT(current_w % target_out_res == 0);
// Avg Pool: Kernel=s, Stride=s
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
}
// F. Final Norm
if (model.msfa_concat_norm_w) {
cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
}
}
// 4. Gemma 3n Multimodal Projection (Embedder)
// Input: 'cur' is [Width, Height, Channels, Batch]
int W = cur->ne[0];
int H = cur->ne[1];
int C = cur->ne[2];
int B = cur->ne[3];
GGML_ASSERT(C == hparams.n_embd);
// 1. Permute and Flatten to [Channels, Tokens, Batch]
// PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
cur = ggml_cont(ctx0, cur);
cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
cur = ggml_cont(ctx0, cur);
// 2. FEATURE SCALING
// PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
const float scale_factor = sqrtf((float)C);
cur = ggml_scale(ctx0, cur, scale_factor);
// 3. SOFT EMBEDDING NORM
// PyTorch: self._norm(x) * self.weight
// We must normalize regardless, then multiply if weight exists.
{
const float eps = 1e-6f; // Gemma3n uses 1e-6
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_soft_emb_norm_w) {
// Weight shape is (2048,) -> Element-wise broadcast multiply
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
}
}
// 4. PROJECTION
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
if (model.mm_input_proj_w) {
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
}
// 5. POST PROJECTION NORM
// PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
// with_scale=False means weight is registered as buffer with value 1.0
// So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
{
const float eps = 1e-6f;
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_post_proj_norm_w) {
// If weight is loaded, multiply (should be ~1.0 anyway)
cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
}
}
ggml_build_forward_expand(gf, cur);
return gf;
}
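One piece of the deleted MobileNetV5 graph worth a worked example is pad_same_2d: PyTorch-style "same" padding derives the total pad from the ceil-divided output size and splits it asymmetrically. A hedged sketch of that arithmetic for a single dimension (e.g. in=7, kernel=3, stride=2, dilation=1 gives out=4, pad=2, split 1/1):

#include <algorithm>
#include <cstdint>

struct same_pad { int before; int after; };

// Total pad = max(0, (out - 1) * stride + (kernel - 1) * dilation + 1 - in),
// where out = ceil(in / stride); the odd element goes on the trailing side.
static same_pad compute_same_pad(int64_t in, int kernel, int stride, int dilation = 1) {
    const int64_t out = (in + stride - 1) / stride;
    const int64_t pad = std::max<int64_t>(0, (out - 1) * stride + (kernel - 1) * dilation + 1 - in);
    return { (int) (pad / 2), (int) (pad - pad / 2) };
}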

View File

@@ -2,11 +2,6 @@
#include "../clip-graph.h" #include "../clip-graph.h"
/*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
struct clip_graph_siglip : clip_graph { struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph { struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
@@ -67,45 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph { struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * rms_norm_2d(
ggml_tensor * inp,
ggml_tensor * weight,
float eps = 1e-6f);
ggml_tensor* pad_same_2d(
ggml_tensor* inp,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h = 1,
int dilation_w = 1);
ggml_tensor * build_edge_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_inverted_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_mobilenet_attn(
ggml_tensor * inp,
const mobilenetv5_block & block);
};

View File

@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
const int scale_factor = model.hparams.n_merge; const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor); cur = build_patch_merge_permute(cur, scale_factor);
// projection, in LFM2-VL input norm is optional // projection
if (model.mm_input_norm_w) { cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
if (model.mm_input_norm_b) {
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
cur = build_ffn(cur, cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b, model.mm_1_w, model.mm_1_b,

View File

@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
FFN_GELU_ERF, FFN_GELU_ERF,
-1); -1);
} else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_GLMA) { } else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps); cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View File

@@ -1,179 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// change conv3d to linear
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
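A note on the deleted youtuvl graph: the ggml_reshape_2d/ggml_get_rows pair treats each group of 4 consecutive tokens (one 2x2 merge unit) as a single row and permutes those rows by inv_window_idx; window_idx applies the inverse permutation after the encoder. A minimal sketch of that gather on plain buffers (illustrative, not ggml):

#include <algorithm>
#include <cstddef>
#include <vector>

// emb holds n_tokens rows of n_embd floats; group_idx has n_tokens / 4 entries,
// each naming the source 4-token block to copy into the next destination slot.
static std::vector<float> reorder_token_groups(const std::vector<float> & emb,
                                               size_t n_embd,
                                               const std::vector<int> & group_idx) {
    const size_t group_stride = 4 * n_embd; // one merged 2x2 block
    std::vector<float> out(emb.size());
    for (size_t dst = 0; dst < group_idx.size(); ++dst) {
        const size_t src = (size_t) group_idx[dst];
        std::copy(emb.begin() + src * group_stride,
                  emb.begin() + (src + 1) * group_stride,
                  out.begin() + dst * group_stride);
    }
    return out;
}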

View File

@@ -9,250 +9,207 @@
#include <fstream> #include <fstream>
#include <algorithm> #include <algorithm>
// some of the code here is copied from whisper.cpp // most of the code here is copied from whisper.cpp
constexpr bool DEBUG = false; constexpr bool DEBUG = false;
void mtmd_audio_cache::fill_sin_cos_table(int n) { struct mtmd_audio_mel_filters {
sin_vals.resize(n); int32_t n_mel;
cos_vals.resize(n); int32_t n_fft;
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { std::vector<float> data;
hann_window.resize(length); };
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, // note: this global cache is shared among all preprocessors
int n_fft, // if we want to use multiple preprocessors at the same time,
int sample_rate, // we will need to enclose it in the preprocessor class in the future
float fmin, static struct mtmd_audio_global_cache {
float fmax, // precomputed sin/cos table for FFT
bool slaney_area_norm, std::vector<float> sin_vals;
float scale) { std::vector<float> cos_vals;
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default) // hann window
const double min_log_hz = 1000.0; std::vector<float> hann_window;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins // mel filter bank
const double bin_hz_step = double(sample_rate) / double(n_fft); mtmd_audio_mel_filters filters;
// mel grid: n_mel + 2 edges void fill_sin_cos_table(int n) {
const double m_lo = hz_to_mel(fmin); sin_vals.resize(n);
const double m_hi = hz_to_mel(fmax); cos_vals.resize(n);
std::vector<double> mel_pts(n_mel + 2); for (int i = 0; i < n; i++) {
for (int i = 0; i < n_mel + 2; ++i) { double theta = (2 * M_PI * i) / n;
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); sin_vals[i] = sinf(theta);
} cos_vals[i] = cosf(theta);
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
} }
} }
filters.n_mel = n_mel; void fill_hann_window(int length, bool periodic) {
filters.n_fft = n_fft; hann_window.resize(length);
filters.data = std::move(out); int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
if (DEBUG) { // debug // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
for (size_t i = 0; i < filters.data.size(); ++i) { // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
if (filters.data[i] != 0.0f) { void fill_mel_filterbank_matrix(
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f); int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling; use 1.0f/1000.0f to mimic your code
) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
}
} }
} }
} }
} } g_cache;
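Both versions of the filterbank code above use the Slaney mel scale, which is linear below 1 kHz (15 mel per kHz) and logarithmic above, with the two branches meeting at 1000 Hz = 15 mel. A small self-contained restatement of those conversions (illustrative; constants taken from the code above):

#include <cmath>

static double hz_to_mel_slaney(double f_hz) {
    const double lin_slope = 3.0 / 200.0;          // 0.015 mel per Hz
    const double log_step  = std::log(6.4) / 27.0;
    return f_hz < 1000.0 ? f_hz * lin_slope
                         : 15.0 + std::log(f_hz / 1000.0) / log_step;
}

static double mel_to_hz_slaney(double m) {
    const double lin_slope = 3.0 / 200.0;
    const double log_step  = std::log(6.4) / 27.0;
    return m < 15.0 ? m / lin_slope
                    : 1000.0 * std::exp((m - 15.0) * log_step);
}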
// Unified DFT implementation for both forward and inverse transforms // naive Discrete Fourier Transform
// Template parameters: // input is real-valued
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling // output is complex-valued
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N static void dft(const float * in, int N, float * out) {
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations const int n_sin_cos_vals = g_cache.sin_vals.size();
// false = input is complex-valued (interleaved real/imag, stride 2) const int sin_cos_step = n_sin_cos_vals / N;
template <bool Inverse, bool RealInput>
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
const float scale = Inverse ? (1.0f / N) : 1.0f;
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
float re = 0; float re = 0;
float im = 0; float im = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
int idx = (k * n * sin_cos_step) % n_sin_cos_vals; int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
float cos_val = cache.cos_vals[idx]; re += in[n] * g_cache.cos_vals[idx]; // cos(t)
float sin_val = cache.sin_vals[idx]; im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
if constexpr (RealInput) {
// Real input: in_im = 0, simplifies to:
// re += in_re * cos_val
// im += sign * in_re * sin_val
float in_re = in[n];
re += in_re * cos_val;
im += sign * in_re * sin_val;
} else {
float in_re = in[n * 2 + 0];
float in_im = in[n * 2 + 1];
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
re += in_re * cos_val - sign * in_im * sin_val;
im += sign * in_re * sin_val + in_im * cos_val;
}
} }
out[k * 2 + 0] = re * scale; out[k*2 + 0] = re;
out[k * 2 + 1] = im * scale; out[k*2 + 1] = im;
} }
} }
// Cooley-Tukey FFT/IFFT unified implementation // Cooley-Tukey FFT
// Template parameters: // poor man's implementation - use something better
// Inverse: false = FFT with exp(-2πi·k/N), no scaling // input is real-valued
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level // output is complex-valued
// RealInput: true = input is real-valued (stride 1) static void fft(float * in, int N, float * out) {
// false = input is complex-valued (interleaved real/imag, stride 2) const int n_sin_cos_vals = g_cache.sin_vals.size();
template <bool Inverse, bool RealInput>
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
if (N == 1) { if (N == 1) {
out[0] = in[0]; out[0] = in[0];
if constexpr (RealInput) { out[1] = 0;
out[1] = 0.0f;
} else {
out[1] = in[1];
}
return; return;
} }
const int half_N = N / 2; const int half_N = N / 2;
if (N - half_N * 2 == 1) { if (N - half_N*2 == 1) {
// Odd N: fall back to DFT dft(in, N, out);
dft_impl<Inverse, RealInput>(cache, in, N, out);
return; return;
} }
// Split into even and odd float* even = in + N;
if constexpr (RealInput) { for (int i = 0; i < half_N; ++i) {
// Real input: stride is 1, copy only real values even[i]= in[2*i];
float * even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i] = in[2 * i];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2 * i + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
} else {
// Complex input: stride is 2, copy complex pairs
float * even = in + N * 2;
for (int i = 0; i < half_N; ++i) {
even[i * 2 + 0] = in[2 * i * 2 + 0];
even[i * 2 + 1] = in[2 * i * 2 + 1];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
} }
float* even_fft = out + 2 * N;
fft(even, half_N, even_fft);
float * even_fft = out + 2 * N; float* odd = even;
float * odd_fft = even_fft + N; for (int i = 0; i < half_N; ++i) {
odd[i] = in[2*i + 1];
}
float* odd_fft = even_fft + N;
fft(odd, half_N, odd_fft);
const int sin_cos_step = n_sin_cos_vals / N; const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
constexpr float scale = Inverse ? 0.5f : 1.0f;
for (int k = 0; k < half_N; k++) { for (int k = 0; k < half_N; k++) {
int idx = k * sin_cos_step; // t = 2*M_PI*k/N int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = cache.cos_vals[idx]; float re = g_cache.cos_vals[idx]; // cos(t)
float im = sign * cache.sin_vals[idx]; float im = -g_cache.sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2 * k + 0]; float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2 * k + 1]; float im_odd = odd_fft[2*k + 1];
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd); out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd); out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd); out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd); out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
} }
} }
// Forward FFT for real input (used by mel spectrogram)
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<false, true>(cache, in, N, out);
}
// Inverse FFT for complex input
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<true, false>(cache, in, N, out);
}
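// Illustrative sketch (not part of the diff above): how these wrappers are
// driven elsewhere in this file, assuming the same scratch-buffer sizing as
// the worker thread below. The input buffer reserves 2*N floats because the
// recursive even/odd split copies samples past the first N entries, and the
// output buffer reserves extra room for the intermediate even/odd spectra.
//
//   const int N = 512;                              // FFT size (hypothetical)
//   mtmd_audio_cache cache;
//   cache.fill_sin_cos_table(N);                    // sin/cos tables shared by fft/ifft
//   std::vector<float> in (N * 2, 0.0f);            // N real samples + recursion scratch
//   std::vector<float> out(N * 2 * 2 * 2, 0.0f);    // interleaved re/im + recursion scratch
//   // ... fill in[0..N) with windowed samples, then:
//   fft(cache, in.data(), N, out.data());           // out[2k], out[2k+1] = re, im of bin k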
struct filter_params { struct filter_params {
int32_t n_mel; int32_t n_mel;
int32_t n_fft_bins; int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
bool norm_per_feature = false; bool norm_per_feature = false;
}; };
static void log_mel_spectrogram_worker_thread(int ith, static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
const float * hann, int n_samples, int frame_size, int frame_step, int n_threads,
const std::vector<float> & samples, const filter_params & params, mtmd_audio_mel & out) {
int n_samples,
int frame_size,
int frame_step,
int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
std::vector<float> fft_in(frame_size * 2, 0.0); std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2); std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft_bins = params.n_fft_bins; int n_fft_bins = params.n_fft_bins;
int i = ith; int i = ith;
const auto & filters = cache.filters; const auto & filters = g_cache.filters;
// make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
// calculate FFT only when fft_in are not all zero // calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
const int offset = i * frame_step; const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
} }
// FFT // FFT
fft(cache, fft_in.data(), frame_size, fft_out.data()); fft(fft_in.data(), frame_size, fft_out.data());
// Calculate modulus^2 of complex numbers // Calculate modulus^2 of complex numbers
// Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
const int n_samples_in, const int n_samples_in,
const int n_threads, const int n_threads,
const filter_params & params, const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) { mtmd_audio_mel & out) {
//const int64_t t_start_us = ggml_time_us(); //const int64_t t_start_us = ggml_time_us();
@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
int n_samples = n_samples_in; int n_samples = n_samples_in;
// Hann window // Hann window
const float * hann = cache.hann_window.data(); const float * hann = g_cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2; const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length; const int frame_step = params.hop_length;
// Padding // Padding
std::vector<float> samples_padded; std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(
// preemphasis // preemphasis
if (params.preemph) { if (params.preemph) {
const int pad_amount = frame_size / 2; const int pad_amount = frame_size / 2;
const float preemph = 0.97f; const float preemph = 0.97f;
float prev = samples_padded[pad_amount]; float prev = samples_padded[pad_amount];
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) { for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
float cur = samples_padded[i]; float cur = samples_padded[i];
samples_padded[i] = cur - preemph * prev; samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
{ {
std::vector<std::thread> workers(n_threads - 1); std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) { for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] = workers[iw] = std::thread(
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples, log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out)); n_samples, frame_size, frame_step, n_threads,
std::cref(params), std::ref(out));
} }
// main thread // main thread
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
cache, out);
for (int iw = 0; iw < n_threads - 1; ++iw) { for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join(); workers[iw].join();
} }
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(
for (int j = 0; j < effective_n_len; ++j) { for (int j = 0; j < effective_n_len; ++j) {
auto &value = out.data[i * out.n_len + j]; auto &value = out.data[i * out.n_len + j];
value = (value - mean) / mstd; value = (value - mean) / mstd;
} }
// pad the rest with zeros // pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
// //
void mtmd_audio_preprocessor_whisper::initialize() { void mtmd_audio_preprocessor_whisper::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft); g_cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true); g_cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); g_cache.fill_mel_filterbank_matrix(
hparams.n_mel_bins,
hparams.audio_n_fft,
hparams.audio_sample_rate);
} }
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, bool mtmd_audio_preprocessor_whisper::preprocess(
size_t n_samples, const float * samples,
std::vector<mtmd_audio_mel> & output) { size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
if (n_samples == 0) { if (n_samples == 0) {
// empty audio // empty audio
return false; return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
// if input is too short, pad with zeros // if input is too short, pad with zeros
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
// TODO: maybe handle this better // TODO: maybe handle this better
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
if (n_samples < min_samples) { if (n_samples < min_samples) {
smpl.resize(min_samples, 0.0f); smpl.resize(min_samples, 0.0f);
std::memcpy(smpl.data(), samples, n_samples * sizeof(float)); std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
params.hop_length = hparams.audio_hop_len; params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate; params.sample_rate = hparams.audio_sample_rate;
params.center_padding = false; params.center_padding = false;
params.preemph = 0.0f; // disabled params.preemph = 0.0f; // disabled
params.use_natural_log = false; params.use_natural_log = false;
params.norm_per_feature = false; params.norm_per_feature = false;
// make sure the cache is initialized // make sure the global cache is initialized
GGML_ASSERT(!cache.sin_vals.empty()); GGML_ASSERT(!g_cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty()); GGML_ASSERT(!g_cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty()); GGML_ASSERT(!g_cache.filters.data.empty());
mtmd_audio_mel out_full; mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples, bool ok = log_mel_spectrogram(
4, // n_threads samples,
params, cache, out_full); n_samples,
4, // n_threads
params,
out_full);
if (!ok) { if (!ok) {
return false; return false;
} }
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
} }
const size_t frames_per_chunk = 3000; const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
if ((size_t) n_len < frames_per_chunk) { if ((size_t)n_len < frames_per_chunk) {
break; // last incomplete chunk will always be a padded chunk, safe to ignore break; // last incomplete chunk will always be a padded chunk, safe to ignore
} }
mtmd_audio_mel out_chunk; mtmd_audio_mel out_chunk;
out_chunk.n_len = n_len; out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel; out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) { for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i * out_full.n_len + off; auto src = out_full.data.begin() + i*out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
} }
@@ -579,152 +535,3 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
return true; return true;
} }
//
// mtmd_audio_preprocessor_conformer
//
void mtmd_audio_preprocessor_conformer::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
// empty audio
if (n_samples == 0) {
return false;
}
filter_params params;
params.n_mel = hparams.n_mel_bins;
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
params.hann_window_size = hparams.audio_window_len;
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = true;
params.preemph = 0.97f;
params.use_natural_log = true;
params.norm_per_feature = true;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
if (!ok) {
return false;
}
output.push_back(std::move(out_full));
return true;
}
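// Illustrative sketch (not part of the diff above): the call pattern both
// preprocessors expect, assuming a hypothetical clip_ctx pointer and a buffer
// of mono float samples at the model's sample rate.
//
//   mtmd_audio_preprocessor_whisper pp(ctx_clip);   // or mtmd_audio_preprocessor_conformer
//   pp.initialize();                                // builds this instance's cache
//   std::vector<mtmd_audio_mel> mels;
//   if (pp.preprocess(samples.data(), samples.size(), mels)) {
//       // each entry holds an n_mel x n_len block of log-mel frames
//   }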
//
// mtmd_audio_streaming_istft implementation
//
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
n_fft(n_fft),
hop_length(hop_length),
n_fft_bins(n_fft / 2 + 1),
overlap_buffer(n_fft, 0.0f),
window_sum_buffer(n_fft, 0.0f),
padding_to_remove((n_fft - hop_length) / 2),
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
ifft_out(n_fft * 2 * 4, 0.0f) {
cache.fill_sin_cos_table(n_fft);
cache.fill_hann_window(n_fft, true);
}
void mtmd_audio_streaming_istft::reset() {
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
padding_to_remove = (n_fft - hop_length) / 2;
}
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
std::vector<float> output(hop_length);
// copy frequencies
for (int j = 0; j < n_fft_bins; j++) {
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
}
// mirror negative frequencies
for (int j = 1; j < n_fft_bins - 1; j++) {
int mirror_idx = n_fft - j;
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
}
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
// update window sum and overlap buffer
for (int j = 0; j < n_fft; j++) {
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
}
// extract hop_length samples with normalization
for (int i = 0; i < hop_length; i++) {
if (window_sum_buffer[i] > 1e-8f) {
output[i] = overlap_buffer[i] / window_sum_buffer[i];
} else {
output[i] = overlap_buffer[i];
}
}
// shift buffers left by hop_length
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
// Remove padding if needed
int to_remove = std::min(padding_to_remove, (int) output.size());
padding_to_remove -= to_remove;
output.erase(output.begin(), output.begin() + to_remove);
return output;
}
std::vector<float> mtmd_audio_streaming_istft::flush() {
std::vector<float> output;
// Extract remaining samples from overlap buffer
// Continue until we've extracted all meaningful samples
int remaining = n_fft - hop_length;
while (remaining > 0) {
int chunk_size = std::min(remaining, hop_length);
for (int i = 0; i < chunk_size; i++) {
float sample;
if (window_sum_buffer[i] > 1e-8f) {
sample = overlap_buffer[i] / window_sum_buffer[i];
} else {
sample = overlap_buffer[i];
}
output.push_back(sample);
}
// Shift buffers
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
remaining -= chunk_size;
}
return output;
}
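// Illustrative sketch (not part of the diff above): the streaming ISTFT call
// pattern, assuming spectra arrive one frame at a time as (n_fft/2 + 1)
// interleaved re/im pairs (the `frames` container here is hypothetical).
//
//   mtmd_audio_streaming_istft istft(/*n_fft=*/512, /*hop_length=*/128);
//   std::vector<float> audio;
//   for (const float * frame : frames) {
//       std::vector<float> chunk = istft.process_frame(frame);  // up to hop_length samples
//       audio.insert(audio.end(), chunk.begin(), chunk.end());
//   }
//   std::vector<float> tail = istft.flush();                     // drain the overlap buffer
//   audio.insert(audio.end(), tail.begin(), tail.end());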

View File

@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
std::vector<float> data; std::vector<float> data;
}; };
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// cache for audio processing, each processor instance owns its own cache
struct mtmd_audio_cache {
std::vector<float> sin_vals;
std::vector<float> cos_vals;
std::vector<float> hann_window;
mtmd_audio_mel_filters filters;
void fill_sin_cos_table(int n);
void fill_hann_window(int length, bool periodic);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling
);
};
struct mtmd_audio_preprocessor { struct mtmd_audio_preprocessor {
const clip_hparams & hparams; const clip_hparams & hparams;
@@ -63,51 +31,4 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override; void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
//
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
//
struct mtmd_audio_streaming_istft {
mtmd_audio_streaming_istft(int n_fft, int hop_length);
// reset streaming state
void reset();
// process a single STFT frame (streaming)
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
// returns: up to hop_length samples
std::vector<float> process_frame(const float * frame_spectrum);
// flush remaining samples at end of stream
std::vector<float> flush();
private:
int n_fft;
int hop_length;
int n_fft_bins;
// Own cache for output processing
mtmd_audio_cache cache;
// Streaming state
std::vector<float> overlap_buffer;
std::vector<float> window_sum_buffer;
int padding_to_remove;
// Working buffers for IFFT
std::vector<float> ifft_in;
std::vector<float> ifft_out;
}; };

View File

@@ -121,8 +121,6 @@ mtmd_context_params mtmd_context_params_default() {
/* warmup */ true, /* warmup */ true,
/* image_min_tokens */ -1, /* image_min_tokens */ -1,
/* image_max_tokens */ -1, /* image_max_tokens */ -1,
/* cb_eval */ nullptr,
/* cb_eval_user_data */ nullptr,
}; };
return params; return params;
} }
@@ -158,6 +156,8 @@ struct mtmd_context {
bool tok_row_end_trail = false; bool tok_row_end_trail = false;
bool ov_img_first = false; bool ov_img_first = false;
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// string template for slice image delimiters with row/col (idefics3) // string template for slice image delimiters with row/col (idefics3)
std::string sli_img_start_tmpl; std::string sli_img_start_tmpl;
@@ -188,8 +188,6 @@ struct mtmd_context {
/* image_min_tokens */ ctx_params.image_min_tokens, /* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens, /* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup, /* warmup */ ctx_params.warmup,
/* cb_eval */ ctx_params.cb_eval,
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
}; };
auto res = clip_init(mmproj_fname, ctx_clip_params); auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -229,6 +227,7 @@ struct mtmd_context {
void init_vision() { void init_vision() {
GGML_ASSERT(ctx_v != nullptr); GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v); projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v);
@@ -277,7 +276,7 @@ struct mtmd_context {
} }
// set boi/eoi // set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) { if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image> // <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>"; img_beg = "<start_of_image>";
img_end = "<end_of_image>"; img_end = "<end_of_image>";
@@ -294,7 +293,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]"; img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) { } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|> // <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>"; img_beg = "<|vision_start|>";
img_end = "<|vision_end|>"; img_end = "<|vision_end|>";
@@ -340,13 +339,8 @@ struct mtmd_context {
case PROJECTOR_TYPE_QWEN25O: case PROJECTOR_TYPE_QWEN25O:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a); audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break; break;
case PROJECTOR_TYPE_LFM2A:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
break;
default: default:
GGML_ABORT("unsupported audio projector type"); GGML_ABORT("unsupported audio projector type");
} }
@@ -364,9 +358,6 @@ struct mtmd_context {
// [BEGIN_AUDIO] ... (embeddings) ... // [BEGIN_AUDIO] ... (embeddings) ...
aud_beg = "[BEGIN_AUDIO]"; aud_beg = "[BEGIN_AUDIO]";
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// <sound> ... (embeddings) ...
aud_beg = "<sound>";
} }
} }
@@ -638,7 +629,7 @@ struct mtmd_tokenizer {
} }
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (mtmd_decode_use_mrope(ctx)) { if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions // for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
@@ -873,24 +864,14 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
} }
bool mtmd_decode_use_non_causal(mtmd_context * ctx) { bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) { if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
case PROJECTOR_TYPE_GEMMA3: return true;
return true;
default:
return false;
} }
return false;
} }
bool mtmd_decode_use_mrope(mtmd_context * ctx) { bool mtmd_decode_use_mrope(mtmd_context * ctx) {
switch (ctx->proj_type_v()) { return ctx->use_mrope;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return true;
default:
return false;
}
} }
bool mtmd_support_vision(mtmd_context * ctx) { bool mtmd_support_vision(mtmd_context * ctx) {

View File

@@ -27,9 +27,6 @@
* - Make sure the C API is aligned with the libllama C API (as in llama.h) * - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary * - Keep the API minimal, do not expose internal details unless necessary
*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/ */
#ifdef LLAMA_SHARED #ifdef LLAMA_SHARED
@@ -98,10 +95,6 @@ struct mtmd_context_params {
// limit number of image tokens, only for vision models with dynamic resolution // limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
// callback function passed over to mtmd proper
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
}; };
MTMD_API const char * mtmd_default_marker(void); MTMD_API const char * mtmd_default_marker(void);
@@ -227,7 +220,7 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
// get output embeddings from the last encode pass // get output embeddings from the last encode pass
// the reading size (in bytes) is equal to: // the reading size (in bytes) is equal to:
// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// Set callback for all future logging events. // Set callback for all future logging events.
@@ -280,12 +273,12 @@ struct bitmap {
ptr.reset(mtmd_bitmap_init(nx, ny, data)); ptr.reset(mtmd_bitmap_init(nx, ny, data));
} }
~bitmap() = default; ~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
}; };
struct bitmaps { struct bitmaps {
@@ -309,8 +302,8 @@ struct input_chunks {
input_chunks() = default; input_chunks() = default;
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
~input_chunks() = default; ~input_chunks() = default;
size_t size() const { return mtmd_input_chunks_size(ptr.get()); } size_t size() { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) const { const mtmd_input_chunk * operator[](size_t idx) {
return mtmd_input_chunks_get(ptr.get(), idx); return mtmd_input_chunks_get(ptr.get(), idx);
} }
}; };

View File

@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-) 8 files changed, 21 insertions(+), 2 deletions(-)
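Across all of these backends the change is the same one-liner: once the backend-specific context has been released, the free_buffer callback now also deletes the buffer object itself. A sketch of the pattern with a hypothetical backend name (not one of the literal hunks below):

    static void ggml_backend_example_buffer_free_buffer(ggml_backend_buffer_t buffer) {
        example_buffer_context * ctx = (example_buffer_context *) buffer->context;
        delete ctx;      // backend-specific cleanup, unchanged
        delete buffer;   // added: the buffer object is freed here as well
    }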
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 354876574..9e67c769a 100644 index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -42,7 +42,7 @@ index 354876574..9e67c769a 100644
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2126,6 +2126,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -2125,6 +2125,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer); GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
@@ -54,7 +54,7 @@ index 354876574..9e67c769a 100644
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2178,7 +2183,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { @@ -2177,7 +2182,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index 354876574..9e67c769a 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 42c6c67a4..db33e0bc0 100644 index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) { @@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx; delete ctx;
@@ -75,7 +75,7 @@ index 42c6c67a4..db33e0bc0 100644
} }
/** /**
@@ -1559,6 +1560,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf @@ -1570,6 +1571,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/ */
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context)); ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,7 +84,7 @@ index 42c6c67a4..db33e0bc0 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e9df0ea4a..290d762ad 100644 index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context { @@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index e9df0ea4a..290d762ad 100644
static void * ggml_cuda_host_malloc(size_t size) { static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 56b59f0af..790cabca0 100644 index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 56b59f0af..790cabca0 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 678e40965..0b3914ce6 100644 index 0d37587f6..ff373d413 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3675,6 +3675,7 @@ struct ggml_backend_opencl_buffer_context { @@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx; delete ctx;
@@ -144,10 +144,10 @@ index 678e40965..0b3914ce6 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d7c8ad8c1..281fa1bdb 100644 index 18a45d2d9..89041805e 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -557,6 +557,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status); RPC_STATUS_ASSERT(status);
delete ctx; delete ctx;
@@ -156,7 +156,7 @@ index d7c8ad8c1..281fa1bdb 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ce2f0d41c..3d5924105 100644 index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -175,19 +175,19 @@ index ce2f0d41c..3d5924105 100644
} }
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1175,6 +1177,7 @@ inline void free_aligned_mem_host(void * memblock) { @@ -1159,6 +1161,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free_aligned_mem_host((void *)buffer->context); ggml_sycl_host_free(buffer->context);
+ delete buffer; + delete buffer;
} }
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b5e5dba95..cc9b38b54 100644 index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12859,6 +12859,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
@@ -195,7 +195,7 @@ index b5e5dba95..cc9b38b54 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -13002,6 +13003,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);

View File

@@ -6,14 +6,14 @@ Subject: [PATCH] pretokenizer
allow for an unset pretokenizer with a warning in the allow for an unset pretokenizer with a warning in the
logs instead of throwing an error logs instead of throwing an error
--- ---
src/llama-vocab.cpp | 17 +++++------------ src/llama-vocab.cpp | 14 +++-----------
1 file changed, 5 insertions(+), 12 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
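In other words, the hard error on an unknown or missing pre-tokenizer value is downgraded to a warning plus the default pre-tokenizer. A sketch of the fallback pattern (not the literal replacement hunk):

    } else {
        LLAMA_LOG_WARN("%s: unknown pre-tokenizer type '%s', falling back to 'default'\n",
                __func__, tokenizer_pre.c_str());
        pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }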
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d00..886ed637d 100644 index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1839,16 +1839,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false; add_space_prefix = false;
clean_spaces = true; clean_spaces = true;
@@ -31,8 +31,8 @@ index a23950d00..886ed637d 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -2042,7 +2033,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -41,20 +41,3 @@ index a23950d00..886ed637d 100644
} }
} else if (type == LLAMA_VOCAB_TYPE_SPM) { } else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2086,6 +2078,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}
+ const uint32_t n_scores = score_idx != -1 ? gguf_get_arr_n(ctx, score_idx) : 0;
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
@@ -2107,7 +2100,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
auto & token_data = id_to_token[i];
token_data.text = std::move(word);
- token_data.score = scores ? scores[i] : 0.0f;
+ token_data.score = (scores && i < n_scores) ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file

View File

@@ -6,11 +6,11 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters filesystems for paths that include wide characters
--- ---
tools/mtmd/clip.cpp | 47 +++++++++++++++++++++++++++++++++++++++++---- tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+), 4 deletions(-) 1 file changed, 39 insertions(+)
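The fix follows the usual Win32 pattern: convert the UTF-8 model path to UTF-16 and open the stream with the wide path so non-ASCII characters resolve correctly. A sketch of that conversion (hypothetical helper name, requires <windows.h>; not the literal hunk):

    #if defined(_WIN32)
    static std::wstring utf8_to_wstring(const std::string & s) {
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
        std::wstring w(n, 0);
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
        return w;
    }
    // later, when loading tensor data:
    // std::ifstream fin(utf8_to_wstring(fname), std::ios::binary);
    #endif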
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9b076e0c5..18dab19df 100644 index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@ @@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 9b076e0c5..18dab19df 100644
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1837,7 +1850,29 @@ struct clip_model_loader { @@ -1619,7 +1632,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 9b076e0c5..18dab19df 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1864,7 +1899,11 @@ struct clip_model_loader { @@ -1646,7 +1681,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
@@ -75,39 +75,3 @@ index 9b076e0c5..18dab19df 100644
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
} }
@@ -2247,7 +2286,7 @@ struct img_tool {
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width;
dst.ny = target_resolution.height;
- dst.buf.resize(3 * dst.nx * dst.ny);
+ dst.buf.resize(3 * static_cast<size_t>(dst.nx) * static_cast<size_t>(dst.ny));
if (dst.nx == src.nx && dst.ny == src.ny) {
// no resize needed, simple copy
@@ -2300,7 +2339,7 @@ struct img_tool {
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w;
dst.ny = h;
- dst.buf.resize(3 * w * h);
+ dst.buf.resize(3 * static_cast<size_t>(w) * static_cast<size_t>(h));
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
@@ -2397,7 +2436,7 @@ private:
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
dst.nx = target_width;
dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
+ dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height));
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
@@ -2436,7 +2475,7 @@ private:
dst.nx = target_width;
dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
+ dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height));
float Cc;
float C[5] = {};

View File

@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
create mode 100644 src/models/solar.cpp create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f337afd6b..b08cd324d 100644 index 4192af7c0..bd44d73e7 100644
--- a/src/CMakeLists.txt --- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt +++ b/src/CMakeLists.txt
@@ -131,6 +131,7 @@ add_library(llama @@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp models/seed-oss.cpp
models/smallthinker.cpp models/smallthinker.cpp
models/smollm3.cpp models/smollm3.cpp
@@ -31,10 +31,10 @@ index f337afd6b..b08cd324d 100644
models/starcoder.cpp models/starcoder.cpp
models/starcoder2.cpp models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956..a62a03e14 100644 index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -90,6 +90,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -42,7 +42,7 @@ index a54bc1956..a62a03e14 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -216,6 +217,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
@@ -50,7 +50,7 @@ index a54bc1956..a62a03e14 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -348,6 +350,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = { @@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
@@ -58,9 +58,9 @@ index a54bc1956..a62a03e14 100644
{ LLM_TENSOR_POS_EMBD, "position_embd" }, { LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -2289,6 +2292,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) { @@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN, return {
LLM_TENSOR_FFN_UP, LLM_TENSOR_TOKEN_EMBD,
}; };
+ case LLM_ARCH_SOLAR: + case LLM_ARCH_SOLAR:
+ return { + return {
@@ -81,7 +81,7 @@ index a54bc1956..a62a03e14 100644
default: default:
GGML_ABORT("unknown architecture for tensor mapping"); GGML_ABORT("unknown architecture for tensor mapping");
} }
@@ -2457,6 +2476,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index a54bc1956..a62a03e14 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16..d96470a0d 100644 index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -94,6 +94,7 @@ enum llm_arch { @@ -91,6 +91,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 270d28b16..d96470a0d 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
@@ -220,6 +221,7 @@ enum llm_kv { @@ -212,6 +213,7 @@ enum llm_kv {
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 270d28b16..d96470a0d 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -474,6 +476,7 @@ enum llm_tensor { @@ -465,6 +467,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
@@ -118,10 +118,10 @@ index 270d28b16..d96470a0d 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 392f9160c..14e089efb 100644 index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -167,6 +167,14 @@ uint32_t llama_hparams::n_pos_per_embd() const { @@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1; return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
} }
@@ -137,10 +137,10 @@ index 392f9160c..14e089efb 100644
if (il < n_layer) { if (il < n_layer) {
return swa_layers[il]; return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index caed0ec1b..61a1fbef6 100644 index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -66,6 +66,8 @@ struct llama_hparams { @@ -65,6 +65,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -149,7 +149,7 @@ index caed0ec1b..61a1fbef6 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -267,6 +269,9 @@ struct llama_hparams { @@ -259,6 +261,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const; uint32_t n_pos_per_embd() const;
@@ -158,12 +158,12 @@ index caed0ec1b..61a1fbef6 100644
+ +
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
// note: currently only support if either all or none of the layers are MLA bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 383b8dc76..c2e758737 100644 index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -497,7 +497,7 @@ namespace GGUFMeta { @@ -466,7 +466,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index 383b8dc76..c2e758737 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index cc784e1cb..c093207e0 100644 index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -2114,6 +2114,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
@@ -198,7 +198,7 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5741,6 +5756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -233,7 +233,7 @@ index cc784e1cb..c093207e0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7981,6 +8024,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { @@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params); llm = std::make_unique<llm_build_chameleon>(*this, params);
} break; } break;
@@ -244,7 +244,7 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -8259,6 +8306,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index d1de16e3f..e8452eda5 100644 index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -80,6 +80,7 @@ enum llm_type { @@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index d1de16e3f..e8452eda5 100644
LLM_TYPE_26B, LLM_TYPE_26B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
@@ -411,6 +412,8 @@ struct llama_layer { @@ -405,6 +406,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr; struct ggml_tensor * ffn_act_eps = nullptr;
@@ -274,10 +274,10 @@ index d1de16e3f..e8452eda5 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h diff --git a/src/models/models.h b/src/models/models.h
index 3a44f7f14..eabe9c81c 100644 index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h --- a/src/models/models.h
+++ b/src/models/models.h +++ b/src/models/models.h
@@ -544,6 +544,11 @@ struct llm_build_smollm3 : public llm_graph_context { @@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params); llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
}; };


@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 886ed637d..923e850cb 100644 index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 886ed637d..923e850cb 100644
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe61..6d1084f26 100644 index bb44edfad..13ced055f 100644
--- a/src/unicode.cpp --- a/src/unicode.cpp
+++ b/src/unicode.cpp +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@ @@ -2,6 +2,11 @@


@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-) 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 6bee1bc4b..f3d371dcc 100644 index 4181a714a..079dba211 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -167,7 +167,7 @@ struct ggml_backend_reg_entry { @@ -183,7 +183,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends; std::vector<ggml_backend_reg_entry> backends;
@@ -23,7 +23,7 @@ index 6bee1bc4b..f3d371dcc 100644
ggml_backend_registry() { ggml_backend_registry() {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
@@ -221,7 +221,7 @@ struct ggml_backend_registry { @@ -237,7 +237,7 @@ struct ggml_backend_registry {
} }
} }
@@ -32,7 +32,7 @@ index 6bee1bc4b..f3d371dcc 100644
if (!reg) { if (!reg) {
return; return;
} }
@@ -232,15 +232,20 @@ struct ggml_backend_registry { @@ -248,15 +248,20 @@ struct ggml_backend_registry {
#endif #endif
backends.push_back({ reg, std::move(handle) }); backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@@ -56,7 +56,7 @@ index 6bee1bc4b..f3d371dcc 100644
} }
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) { ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -284,7 +289,7 @@ struct ggml_backend_registry { @@ -300,7 +305,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str()); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
@@ -65,7 +65,7 @@ index 6bee1bc4b..f3d371dcc 100644
return reg; return reg;
} }
@@ -307,7 +312,7 @@ struct ggml_backend_registry { @@ -323,7 +328,7 @@ struct ggml_backend_registry {
// remove devices // remove devices
devices.erase( devices.erase(
std::remove_if(devices.begin(), devices.end(), std::remove_if(devices.begin(), devices.end(),
@@ -74,7 +74,7 @@ index 6bee1bc4b..f3d371dcc 100644
devices.end()); devices.end());
// remove backend // remove backend
@@ -365,7 +370,7 @@ size_t ggml_backend_dev_count() { @@ -381,7 +386,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) { ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count()); GGML_ASSERT(index < ggml_backend_dev_count());


@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6192a8704..993ec027f 100644 index 4c04c3300..f4747f262 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name) @@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -26,4 +26,4 @@ index 6192a8704..993ec027f 100644
+ add_custom_target(ggml-cpu) + add_custom_target(ggml-cpu)
if (GGML_SYSTEM_ARCH STREQUAL "x86") if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64) ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42) ggml_add_cpu_backend_variant(sse42 SSE42)


@@ -5,22 +5,21 @@ Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems disable amx as it reduces performance on some systems
--- ---
ggml/src/CMakeLists.txt | 5 +---- ggml/src/CMakeLists.txt | 4 ----
1 file changed, 1 insertion(+), 4 deletions(-) 1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 993ec027f..cbda1380c 100644 index f4747f262..d55aed348 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -379,10 +379,7 @@ if (GGML_CPU_ALL_VARIANTS) @@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16) ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
endif() ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI) ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC) - if (NOT MSVC)
- # MSVC doesn't support AMX - # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) - ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif() - endif()
+ # AMX variants removed by ollama - sapphirerapids with AMX_TILE AMX_INT8 not included
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM") elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular # Many of these features are optional so we build versions with popular


@@ -25,10 +25,10 @@ index 79ee20206..3efb22f01 100644
// get ith C string from array with given key_id // get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ed0d7f2ca..db55f6ed1 100644 index b165d8bdc..f91d4faba 100644
--- a/ggml/src/gguf.cpp --- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp
@@ -813,10 +813,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id @@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) { const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
@@ -44,7 +44,7 @@ index ed0d7f2ca..db55f6ed1 100644
const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) { const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING); GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
@@ -910,7 +914,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) { @@ -902,7 +906,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) { const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1); GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
@@ -53,10 +53,10 @@ index ed0d7f2ca..db55f6ed1 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 923e850cb..0917191b5 100644 index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1795,9 +1795,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) { if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);


@@ -8,19 +8,19 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b1de2ae87..42e892527 100644 index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -16,6 +16,8 @@ @@ -15,6 +15,8 @@
#include "ops.h"
#include "ggml.h" #include "ggml.h"
#include "common.h"
+#include "ollama-debug.h" +#include "ollama-debug.h"
+ +
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2952,6 +2954,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2945,6 +2947,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);


@@ -10,10 +10,10 @@ Subject: [PATCH] add ollama vocab for grammar support
3 files changed, 58 insertions(+), 10 deletions(-) 3 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 64ea2fd00..d87e52ded 100644 index 75d5d750c..a0299d181 100644
--- a/src/llama-grammar.cpp --- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp
@@ -1079,6 +1079,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack( @@ -1041,6 +1041,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -21,7 +21,7 @@ index 64ea2fd00..d87e52ded 100644
const llama_grammar_element ** rules, const llama_grammar_element ** rules,
size_t n_rules, size_t n_rules,
size_t start_rule_index) { size_t start_rule_index) {
@@ -1134,6 +1135,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1096,6 +1097,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope. // then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { return new llama_grammar {
vocab, vocab,
@@ -29,7 +29,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules), std::move(vec_rules),
std::move(stacks), std::move(stacks),
/* .partial_utf8 = */ {}, /* .partial_utf8 = */ {},
@@ -1148,6 +1150,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1110,6 +1112,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -37,7 +37,7 @@ index 64ea2fd00..d87e52ded 100644
const char * grammar_str, const char * grammar_str,
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
@@ -1240,6 +1243,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1202,6 +1205,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope. // then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { return new llama_grammar {
vocab, vocab,
@@ -45,7 +45,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules), std::move(vec_rules),
std::move(stacks), std::move(stacks),
/* .partial_utf8 = */ {}, /* .partial_utf8 = */ {},
@@ -1263,6 +1267,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) { @@ -1225,6 +1229,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar { auto * result = new llama_grammar {
grammar.vocab, grammar.vocab,
@@ -53,7 +53,7 @@ index 64ea2fd00..d87e52ded 100644
grammar.rules, grammar.rules,
grammar.stacks, grammar.stacks,
grammar.partial_utf8, grammar.partial_utf8,
@@ -1291,7 +1296,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra @@ -1253,7 +1258,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
} }
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) { void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
@@ -61,7 +61,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) { if (grammar.awaiting_trigger) {
return; return;
@@ -1313,9 +1317,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_ @@ -1275,9 +1279,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id; const llama_token id = cur_p->data[i].id;
@@ -77,7 +77,7 @@ index 64ea2fd00..d87e52ded 100644
if (!allow_eog) { if (!allow_eog) {
cur_p->data[i].logit = -INFINITY; cur_p->data[i].logit = -INFINITY;
} }
@@ -1334,9 +1342,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_ @@ -1296,9 +1304,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
} }
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) { void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
@@ -90,7 +90,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) { if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) { if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1380,13 +1389,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token @@ -1353,13 +1362,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
} }
} }
@@ -107,7 +107,7 @@ index 64ea2fd00..d87e52ded 100644
} }
llama_grammar_accept_token(grammar, token, piece); llama_grammar_accept_token(grammar, token, piece);
@@ -1462,3 +1472,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke @@ -1435,3 +1445,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
} }
} }
@@ -136,7 +136,7 @@ index 64ea2fd00..d87e52ded 100644
+ } + }
+} +}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index b5a0e588e..57847583a 100644 index a4c978ac1..5c0da4049 100644
--- a/src/llama-grammar.h --- a/src/llama-grammar.h
+++ b/src/llama-grammar.h +++ b/src/llama-grammar.h
@@ -6,8 +6,19 @@ @@ -6,8 +6,19 @@
@@ -159,7 +159,7 @@ index b5a0e588e..57847583a 100644
// grammar element type // grammar element type
enum llama_gretype { enum llama_gretype {
@@ -129,6 +140,7 @@ struct llama_grammar { @@ -127,6 +138,7 @@ struct llama_grammar {
// note: allow null vocab for testing (not great) // note: allow null vocab for testing (not great)
const llama_vocab * vocab; const llama_vocab * vocab;
@@ -167,7 +167,7 @@ index b5a0e588e..57847583a 100644
const llama_grammar_rules rules; // TODO: shared ptr const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks; llama_grammar_stacks stacks;
@@ -157,12 +169,14 @@ struct llama_grammar { @@ -155,12 +167,14 @@ struct llama_grammar {
// note: needed for tests (not great) // note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -183,10 +183,10 @@ index b5a0e588e..57847583a 100644
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 5dde51306..90f6f1b3d 100644 index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp --- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp
@@ -2504,7 +2504,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { @@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
} }
@@ -195,7 +195,7 @@ index 5dde51306..90f6f1b3d 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -2586,9 +2586,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( @@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*"; trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() }; std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };


@@ -5,17 +5,17 @@ Subject: [PATCH] add argsort and cuda copy for i32
--- ---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++ ggml/src/ggml-cpu/ops.cpp | 43 ++++++
ggml/src/ggml-cuda/argsort.cu | 120 +++++++++++++-- ggml/src/ggml-cuda/argsort.cu | 122 +++++++++++++--
ggml/src/ggml-cuda/cpy-utils.cuh | 6 + ggml/src/ggml-cuda/cpy-utils.cuh | 6 +
ggml/src/ggml-cuda/cpy.cu | 40 +++++ ggml/src/ggml-cuda/cpy.cu | 40 +++++
ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++
5 files changed, 413 insertions(+), 11 deletions(-) 5 files changed, 414 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 48c896436..c08e73f3c 100644 index 303278397..7d1733adb 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7958,6 +7958,45 @@ static void ggml_compute_forward_argsort_f32( @@ -7932,6 +7932,45 @@ static void ggml_compute_forward_argsort_f32(
} }
} }
@@ -61,7 +61,7 @@ index 48c896436..c08e73f3c 100644
void ggml_compute_forward_argsort( void ggml_compute_forward_argsort(
const ggml_compute_params * params, const ggml_compute_params * params,
ggml_tensor * dst) { ggml_tensor * dst) {
@@ -7969,6 +8008,10 @@ void ggml_compute_forward_argsort( @@ -7943,6 +7982,10 @@ void ggml_compute_forward_argsort(
{ {
ggml_compute_forward_argsort_f32(params, dst); ggml_compute_forward_argsort_f32(params, dst);
} break; } break;
@@ -73,10 +73,10 @@ index 48c896436..c08e73f3c 100644
{ {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 4896669c3..6fae8b808 100644 index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu --- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu
@@ -198,13 +198,107 @@ void argsort_f32_i32_cuda_bitonic(const float * x, @@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
} }
} }
@@ -185,27 +185,28 @@ index 4896669c3..6fae8b808 100644
GGML_ASSERT( dst->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
@@ -213,18 +307,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ if (src0->type == GGML_TYPE_I32) { -#ifdef GGML_CUDA_USE_CUB
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else {
#ifdef GGML_CUDA_USE_CUB
- const int ncols_pad = next_power_of_2(ncols); - const int ncols_pad = next_power_of_2(ncols);
- const size_t shared_mem = ncols_pad * sizeof(int); - const size_t shared_mem = ncols_pad * sizeof(int);
- const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; - const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+ const int ncols_pad = next_power_of_2(ncols); -
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
- if (shared_mem > max_shared_mem || ncols > 1024) { - if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool(); - ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); - argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } else { + if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
} else {
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); - argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } - }
+#ifdef GGML_CUDA_USE_CUB
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ if (shared_mem > max_shared_mem || ncols > 1024) { + if (shared_mem > max_shared_mem || ncols > 1024) {
+ ggml_cuda_pool & pool = ctx.pool(); + ggml_cuda_pool & pool = ctx.pool();
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); + argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
@@ -233,10 +234,10 @@ index 7697c292d..00d773dd3 100644
+ *dst = *src; + *dst = *src;
+} +}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index ee84303ef..178e82d76 100644 index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu --- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -369,6 +369,43 @@ static void ggml_cpy_f32_iq4_nl_cuda( @@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
@@ -280,7 +281,7 @@ index ee84303ef..178e82d76 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
@@ -495,6 +532,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg @@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float> ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
@@ -291,10 +292,10 @@ index ee84303ef..178e82d76 100644
if (can_be_transposed) { if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true> ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 17e358d1a..2e463bd99 100644 index 51bcbae30..236838e9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4955,8 +4955,77 @@ kernel void kernel_argsort_f32_i32( @@ -4954,8 +4954,77 @@ kernel void kernel_argsort_f32_i32(
} }
} }
@@ -372,7 +373,7 @@ index 17e358d1a..2e463bd99 100644
typedef void (argsort_merge_t)( typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args, constant ggml_metal_kargs_argsort_merge & args,
@@ -5111,8 +5180,154 @@ kernel void kernel_argsort_merge_f32_i32( @@ -5110,8 +5179,154 @@ kernel void kernel_argsort_merge_f32_i32(
} }
} }


@@ -23,7 +23,7 @@ index 78aa059dd..7fa8403b3 100644
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a9d177864..393c329be 100644 index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -319,6 +319,7 @@ extern "C" { @@ -319,6 +319,7 @@ extern "C" {
@@ -121,7 +121,7 @@ index 41419b617..73b39bfea 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 9e67c769a..20b37a0b3 100644 index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe @@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe


@@ -1,32 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: nobody <> From: Daniel Hiltgen <daniel@ollama.com>
Date: Sat, 10 Jan 2026 15:27:57 -0800 Date: Sun, 30 Nov 2025 11:05:56 -0800
Subject: [PATCH] ggml: Export GPU UUIDs Subject: [PATCH] ggml: Export GPU UUIDs
--- ---
ggml/include/ggml-backend.h | 2 + ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++++++++++++++++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 + ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 69 insertions(+), 6 deletions(-) 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 393c329be..99412fe56 100644 index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,8 @@ extern "C" { @@ -158,6 +158,7 @@ extern "C" {
const char * description; const char * description;
// device free memory in bytes // device free memory in bytes
size_t memory_free; size_t memory_free;
+ // device UUID
+ const char * id; + const char * id;
// device total memory in bytes // device total memory in bytes
size_t memory_total; size_t memory_total;
// device type // device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 290d762ad..9b9e053f0 100644 index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -191,6 +191,51 @@ static int ggml_cuda_parse_id(char devName[]) { @@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
} }
#endif // defined(GGML_USE_HIP) #endif // defined(GGML_USE_HIP)
@@ -78,7 +77,7 @@ index 290d762ad..9b9e053f0 100644
static ggml_cuda_device_info ggml_cuda_init() { static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {}; ggml_cuda_device_info info = {};
@@ -255,22 +300,29 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10; info.devices[id].cc += prop.minor * 0x10;
} }
} }
@@ -103,26 +102,21 @@ index 290d762ad..9b9e053f0 100644
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+#ifdef __CUDA_ARCH_LIST__
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+ }
+#endif // defined(__CUDA_ARCH_LIST__)
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str()); + ggml_cuda_parse_uuid(prop, id).c_str());
std::string device_name(prop.name); std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") { if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name }); turing_devices_without_mma.push_back({ id, device_name });
@@ -4155,6 +4207,7 @@ struct ggml_backend_cuda_device_context { @@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name; std::string name;
std::string description; std::string description;
std::string pci_bus_id; std::string pci_bus_id;
+ std::string id; + std::string id;
int op_offload_min_batch_size;
}; };
@@ -4244,6 +4297,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
} }
#endif // defined(__linux__) #endif // defined(__linux__)
@@ -134,7 +128,7 @@ index 290d762ad..9b9e053f0 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
@@ -4284,6 +4342,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev); props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev); props->description = ggml_backend_cuda_device_get_description(dev);
@@ -142,7 +136,7 @@ index 290d762ad..9b9e053f0 100644
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4900,6 +4959,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
@@ -151,7 +145,7 @@ index 290d762ad..9b9e053f0 100644
char pci_bus_id[16] = {}; char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 790cabca0..516d74064 100644 index f2b7fe692..8fc1c2fb5 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen @@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen

Some files were not shown because too many files have changed in this diff.