Revert "Update vendored llama.cpp to b7847" (#14061)

Jeffrey Morgan
2026-02-03 18:39:36 -08:00
committed by GitHub
parent a6355329bf
commit b1fccabb34
240 changed files with 5050 additions and 21247 deletions


@@ -92,7 +92,7 @@ jobs:
           flags: ''
         - os: windows
           arch: amd64
-          preset: 'CUDA 13 Windows'
+          preset: 'CUDA 13'
           install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
           cuda-components:
             - '"cudart"'


@@ -40,17 +40,7 @@
"name": "CUDA 13", "name": "CUDA 13",
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],
"cacheVariables": { "cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual", "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 4",
"OLLAMA_RUNNER_DIR": "cuda_v13"
}
},
{
"name": "CUDA 13 Windows",
"inherits": [ "CUDA" ],
"description": "Reduced architecture set for Windows to avoid MSVC template compilation issues",
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;89-virtual;100-virtual;120-virtual",
"CMAKE_CUDA_FLAGS": "-t 4", "CMAKE_CUDA_FLAGS": "-t 4",
"OLLAMA_RUNNER_DIR": "cuda_v13" "OLLAMA_RUNNER_DIR": "cuda_v13"
} }
@@ -148,11 +138,6 @@
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],
"configurePreset": "CUDA 13" "configurePreset": "CUDA 13"
}, },
{
"name": "CUDA 13 Windows",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13 Windows"
},
{ {
"name": "JetPack 5", "name": "JetPack 5",
"inherits": [ "CUDA" ], "inherits": [ "CUDA" ],


@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=a5bb8ba4c50257437630c136210396810741bbf7
+FETCH_HEAD=ec98e2002
 .PHONY: help
 help:


@@ -73,18 +73,13 @@ func manhattanDistance[V float32 | float64](v1, v2 []V) V {
 }
 func TestEmbedCosineDistanceCorrelation(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			testCases := []struct {
 				a string
 				b string
@@ -494,19 +489,14 @@ func TestEmbedTruncation(t *testing.T) {
 // TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
 func TestEmbedLargeInput(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		model := model
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
 			defer mcancel()


@@ -21,10 +21,9 @@ func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
 }
 func TestAPIToolCalling(t *testing.T) {
-	initialTimeout := 90 * time.Second
-	streamTimeout := 90 * time.Second
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	initialTimeout := 60 * time.Second
+	streamTimeout := 60 * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
@@ -48,12 +47,8 @@ func TestAPIToolCalling(t *testing.T) {
"granite3.3": 7, "granite3.3": 7,
} }
started := time.Now()
for _, model := range libraryToolsModels { for _, model := range libraryToolsModels {
t.Run(model, func(t *testing.T) { t.Run(model, func(t *testing.T) {
if time.Since(started) > softTimeout {
t.Skip("skipping - soft timeout exceeded")
}
if v, ok := minVRAM[model]; ok { if v, ok := minVRAM[model]; ok {
skipUnderMinVRAM(t, v) skipUnderMinVRAM(t, v)
} }


@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches
 ### Updating Base Commit
-To update to a new base commit:
-1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
-2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
-3. **Apply patches**: Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
-   ```shell
-   make -f Makefile.sync apply-patches
-   ```
-4. **Resolve conflicts** (if any): When `git am` fails on a patch:
-   - Fix conflicts in `./vendor/`
-   - Stage the resolved files: `git -C llama/vendor add <file>`
-   - Continue: `git -C llama/vendor am --continue`
-   - Re-run: `make -f Makefile.sync apply-patches`
-   - Repeat until all patches are applied.
-5. **Regenerate patches and sync**:
-   ```shell
-   make -f Makefile.sync format-patches sync
-   ```
+**Pin to new base commit**
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.
+When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
+```shell
+make -f Makefile.sync apply-patches
+```
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+Once all patches are applied, commit the changes to the tracking repository.
+```shell
+make -f Makefile.sync format-patches sync
+```
 ### Generating Patches
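Both the removed and the restored wording of this section describe the same loop; roughly, and assembled only from the commands quoted above (not an additional change in this commit), it looks like this:

```shell
# apply the patch series on top of the new FETCH_HEAD; git am stops at the first patch that fails
make -f Makefile.sync apply-patches

# on a conflict: fix the files under llama/vendor, then stage and continue the series
git -C llama/vendor add <file>
git -C llama/vendor am --continue
make -f Makefile.sync apply-patches        # rerun until every patch applies

# once everything applies cleanly, regenerate the patches and sync
make -f Makefile.sync format-patches sync
```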

llama/build-info.cpp (generated, vendored)

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "a5bb8ba4c50257437630c136210396810741bbf7";
+char const *LLAMA_COMMIT = "ec98e2002";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";


@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
         case GGML_SCHED_PRIO_REALTIME: p = -20; break;
     }
-    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
         LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
         return false;
     }
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
     impl() = default;
     ~impl() = default;
-    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
     llama_model_ptr   model;
     llama_context_ptr context;
     std::vector<llama_adapter_lora_ptr> lora;
     std::vector<common_sampler_ptr> samplers;
-    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);
     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
+                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
                 params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    // load and optionally apply lora adapters (must be loaded before context creation)
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
-            pimpl->model.reset(model);
-            return;
-        }
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
     // updates params.sampling
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);
@@ -1163,18 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
     //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}
-    // init the backend samplers as part of the context creation
     pimpl->samplers.resize(cparams.n_seq_max);
-    pimpl->samplers_seq_config.resize(cparams.n_seq_max);
     for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
         pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
-    }
-    if (params.sampling.backend_sampling) {
-        cparams.samplers   = pimpl->samplers_seq_config.data();
-        cparams.n_samplers = pimpl->samplers_seq_config.size();
     }
     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1198,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
     return pimpl->samplers[seq_id].get();
 }
-void common_init_result::reset_samplers() {
-    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
-        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
-    }
-}
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
@@ -1279,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         }
     }
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            return res;
+        }
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
@@ -1319,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
-        // reset samplers to reset RNG state after warmup to the seeded state
-        res->reset_samplers();
     }
     return res;
@@ -1360,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
-    mparams.n_gpu_layers = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
     mparams.main_gpu        = params.main_gpu;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
-    mparams.use_direct_io   = params.use_direct_io;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;


@@ -57,8 +57,6 @@ extern const char * LLAMA_COMMIT;
 extern const char * LLAMA_COMPILER;
 extern const char * LLAMA_BUILD_TARGET;
-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 struct common_control_vector_load_info;
 //
@@ -82,8 +80,6 @@ int32_t cpu_get_num_math();
 //
 enum llama_example {
-    LLAMA_EXAMPLE_BATCHED,
-    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,
@@ -121,7 +117,6 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
     COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
-    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
 };
 // dimensionality reduction methods, used by cvector-generator
@@ -169,34 +164,32 @@ enum common_params_sampling_config : uint64_t {
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
     int32_t n_prev             = 64;    // number of previous tokens to remember
     int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t top_k              = 40;    // <= 0 to use vocab size
     float   top_p              = 0.95f; // 1.0 = disabled
     float   min_p              = 0.05f; // 0.0 = disabled
     float   xtc_probability    = 0.00f; // 0.0 = disabled
     float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
     float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
     float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range     = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   penalty_repeat     = 1.00f; // 1.0 = disabled
     float   penalty_freq       = 0.00f; // 0.0 = disabled
     float   penalty_present    = 0.00f; // 0.0 = disabled
     float   dry_multiplier     = 0.0f;  // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
     float   dry_base           = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
     int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
     int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
-    float   adaptive_target    = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
-    float   adaptive_decay     = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
-    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   top_n_sigma        = -1.00f; // -1.0 = disabled
-    float   mirostat_tau       = 5.00f; // target entropy
-    float   mirostat_eta       = 0.10f; // learning rate
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f;// -1.0 = disabled
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
     uint64_t user_sampling_config = 0;  // bitfield to track user-specified samplers
@@ -223,8 +216,6 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
-    bool backend_sampling = false;
     bool has_logit_bias() const {
         return !logit_bias.empty();
     }
@@ -286,7 +277,6 @@ struct common_params_diffusion {
 };
 // reasoning API response format (not to be confused as chat template's reasoning format)
-// only used by server
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
@@ -339,14 +329,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
     float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool    fit_params = true; // whether to fit unset model/context parameters to free device memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
-    // margin per device in bytes for fitting parameters to free memory:
-    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
+    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -382,11 +370,6 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits // NOLINT
-    // llama-debug specific options
-    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
-    bool save_logits = false;               // whether to save logits to files // NOLINT
-    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -437,8 +420,7 @@ struct common_params {
     bool kv_unified       = false; // enable unified KV cache
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap         = true;  // enable mmap to use filesystem cache
-    bool use_direct_io    = true;  // read from disk without buffering for faster model loading
+    bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
     bool verbose_prompt   = false; // print prompt tokens before generation
     bool display_prompt   = true;  // print prompt before generation
@@ -482,7 +464,6 @@ struct common_params {
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
-    bool    cache_prompt   = true;         // whether to enable prompt caching
     int32_t n_ctx_checkpoints = 8;    // max number of context checkpoints per slot
     int32_t cache_ram_mib     = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
@@ -494,8 +475,7 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
     std::vector<std::string> api_keys;
@@ -504,11 +484,8 @@ struct common_params {
     std::map<std::string, std::string> default_template_kwargs;
-    // webui configs
-    bool webui = true;
-    std::string webui_config_json;
     // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
     bool endpoint_slots   = true;
     bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
@@ -708,9 +685,7 @@ struct common_init_result {
     llama_model    * model();
     llama_context  * context();
     common_sampler * sampler(llama_seq_id seq_id);
-    void reset_samplers();
     std::vector<llama_adapter_lora_ptr> & lora();


@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
-    struct llama_sampler * grmr;
     struct llama_sampler * chain;
+    bool grammar;
     ring_buffer<llama_token> prev;
     std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
     }
     void set_logits(struct llama_context * ctx, int idx) {
-        const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
-        const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
-        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
+        const auto * logits = llama_get_logits_ith(ctx, idx);
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
         const int n_vocab = llama_vocab_n_tokens(vocab);
-        if (sampled_probs) {
-            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
-            cur.resize(sampled_probs_count);
-            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
-            }
-        } else if (sampled_logits) {
-            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
-            cur.resize(sampled_logits_count);
-            for (uint32_t i = 0; i < sampled_logits_count; i++) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
-            }
-        } else {
-            const auto * logits = llama_get_logits_ith(ctx, idx);
-            GGML_ASSERT(logits != nullptr);
-            cur.resize(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-            }
+        cur.resize(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
         }
         cur_p = { cur.data(), cur.size(), -1, false };
@@ -167,59 +151,54 @@ std::string common_params_sampling::print() const {
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f", "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay); mirostat, mirostat_eta, mirostat_tau);
return std::string(result); return std::string(result);
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model); const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams); llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers; std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) { if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE #ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else #else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE #endif // LLAMA_USE_LLGUIDANCE
} else { } else {
std::vector<std::string> trigger_patterns; std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens; std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) { for (const auto & trigger : params.grammar_triggers) {
switch (trigger.type) { switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{ {
const auto & word = trigger.value; const auto & word = trigger.value;
trigger_patterns.push_back(regex_escape(word)); patterns_anywhere.push_back(regex_escape(word));
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
{ {
trigger_patterns.push_back(trigger.value); patterns_anywhere.push_back(trigger.value);
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{ {
const auto & pattern = trigger.value; trigger_patterns.push_back(trigger.value);
std::string anchored = "^$";
if (!pattern.empty()) {
anchored = (pattern.front() != '^' ? "^" : "")
+ pattern
+ (pattern.back() != '$' ? "$" : "");
}
trigger_patterns.push_back(anchored);
break; break;
} }
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN: case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
                 }
             }
+            if (!patterns_anywhere.empty()) {
+                trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+            }
             std::vector<const char *> trigger_patterns_c;
             trigger_patterns_c.reserve(trigger_patterns.size());
             for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                        trigger_tokens.data(), trigger_tokens.size());
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size()));
             } else {
-                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
             }
+            grammar = true;
         }
     }
@@ -255,9 +241,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     }
     if (params.mirostat == 0) {
-        bool use_adaptive_p = false; // see below
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
@@ -267,54 +250,43 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
                         for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }
-                        samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
-                    samplers.push_back(llama_sampler_init_top_k(params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k (params.top_k));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_P:
-                    samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
                     samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                     break;
                 case COMMON_SAMPLER_TYPE_MIN_P:
-                    samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_XTC:
-                    samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
-                    samplers.push_back(llama_sampler_init_infill(vocab));
+                    samplers.push_back(llama_sampler_init_infill (vocab));
                     break;
                 case COMMON_SAMPLER_TYPE_PENALTIES:
-                    samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
-                    // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
-                    // a single token, so we will add `dist` at the end of the chain by default,
-                    // unless the user specifically included `adaptive-p`. we set this flag here
-                    // so we know to add the sampler at the very end.
-                    use_adaptive_p = true;
+                    samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                     break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        if (use_adaptive_p) {
-            // only if user explicitly included adaptive-p sampler
-            samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
-        } else {
-            // default: sample from distribution
-            samplers.push_back(llama_sampler_init_dist(params.seed));
-        }
+        samplers.push_back(llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         samplers.push_back(llama_sampler_init_temp(params.temp));
         samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
@@ -329,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         llama_sampler_chain_add(chain, smpl);
     }
-    if (grmr && params.backend_sampling) {
-        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
-        params.backend_sampling = false;
-    }
     auto * result = new common_sampler {
         /* .params  = */ params,
-        /* .grmr    = */ grmr,
         /* .chain   = */ chain,
+        /* .grammar = */ grammar,
         /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur     = */ {},
         /* .cur_p   = */ {},
@@ -348,45 +314,47 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 }
 void common_sampler_free(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->chain);
+        delete gsmpl;
     }
-    llama_sampler_free(gsmpl->grmr);
-    llama_sampler_free(gsmpl->chain);
-    delete gsmpl;
 }
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (!gsmpl) {
-        return;
-    }
     const auto tm = gsmpl->tm();
-    if (gsmpl->grmr && accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
-    llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }
     gsmpl->prev.push_back(token);
 }
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
-    }
     gsmpl->reset();
 }
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
         /* .prev    = */ gsmpl->prev,
         /* .cur     = */ gsmpl->cur,
         /* .cur_p   = */ gsmpl->cur_p,
@@ -439,14 +407,10 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 }
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return nullptr;
-    }
     return gsmpl->chain;
 }
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     llama_synchronize(ctx);
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -454,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     llama_token id = LLAMA_TOKEN_NULL;
-    auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-    // Check if a backend sampler has already sampled a token in which case we
-    // return that token id directly.
-    {
-        id = llama_get_sampled_token_ith(ctx, idx);
-        if (id != LLAMA_TOKEN_NULL) {
-            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
-            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
-            return id;
-        }
-    }
     gsmpl->set_logits(ctx, idx);
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-    llama_sampler_apply(chain, &cur_p);
-    id = cur_p.data[cur_p.selected].id;
-    if (grammar_first) {
-        return id;
-    }
-    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-        llama_sampler_apply(grmr, &single_token_data_array);
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-    llama_sampler_apply(grmr, &cur_p);
     llama_sampler_apply(chain, &cur_p);
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -518,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
     std::vector<llama_token> result;
@@ -526,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
         common_sampler_accept(gsmpl, id, true);
@@ -538,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
         common_sampler_accept(gsmpl, id, true);
@@ -548,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -639,7 +553,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
         case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return 'a';
         default : return '?';
     }
 }
@@ -656,7 +569,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
         case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return "adaptive_p";
         default : return "";
     }
 }
@@ -673,7 +585,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "xtc", COMMON_SAMPLER_TYPE_XTC }, { "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL }, { "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@@ -689,7 +600,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;
@@ -726,7 +636,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P),  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
     };
     std::vector<common_sampler_type> samplers;


@@ -36,8 +36,7 @@ struct common_sampler;
 // llama_sampler API overloads
-// note: can mutate params in some cases
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 void common_sampler_free(struct common_sampler * gsmpl);
@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
-// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
 // generalized version of common_sampler_sample
 //
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);


@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
 };
 struct llama_adapter_lora_deleter {
-    void operator()(llama_adapter_lora *) {
-        // llama_adapter_lora_free is deprecated
-    }
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;


@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
-        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
+        int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
-        bool use_direct_io;   // use direct io, takes precedence over use_mmap
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
         bool no_alloc;        // only load metadata and simulate memory allocations
     };
-    struct llama_sampler_seq_config {
-        llama_seq_id seq_id;
-        struct llama_sampler * sampler;
-    };
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     //       https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        // [EXPERIMENTAL]
-        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
-        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
-        struct llama_sampler_seq_config * samplers;
-        size_t n_samplers;
     };
// model quantization parameters // model quantization parameters
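
Editor's note: as a rough sketch of how the per-sequence backend sampler configuration removed by this revert was meant to be wired up, based only on the llama_sampler_seq_config and samplers/n_samplers declarations above (llama_init_from_model and the sampler-chain helpers are assumed from the surrounding llama.h; this only applies to the pre-revert header):

    #include "llama.h"

    // sketch: attach one greedy sampler chain to sequence 0 at context creation,
    // via the llama_sampler_seq_config / samplers fields removed by this revert
    static llama_context * ctx_with_backend_sampler(llama_model * model) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());

        llama_sampler_seq_config cfg = { /*seq_id=*/ 0, /*sampler=*/ chain };

        llama_context_params cparams = llama_context_default_params();
        cparams.samplers   = &cfg; // caller must keep the chain alive (see note above)
        cparams.n_samplers = 1;

        return llama_init_from_model(model, cparams);
    }
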
@@ -479,24 +467,16 @@ extern "C" {
// Frees all allocated memory // Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited) // fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns true if the parameters could be successfully modified to fit device memory // returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state // this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified LLAMA_API bool llama_params_fit(
// with the exception of the context size which is modified if and only if equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model, const char * path_model,
struct llama_model_params * mparams, struct llama_model_params * mparams,
struct llama_context_params * cparams, struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes size_t margin, // margin of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
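
Editor's note: a minimal sketch of calling the bool llama_params_fit signature restored by this revert, assuming llama_max_devices() and llama_max_tensor_buft_overrides() are the helpers the comments above refer to; the 512 MiB margin and 4096-token minimum context are arbitrary example values:

    #include <vector>
    #include "llama.h"

    // sketch: ask llama.cpp to shrink mparams/cparams until they are projected to
    // fit free device memory (writable scratch buffers sized per the comments above)
    static bool fit_params(const char * path_model,
                           llama_model_params   & mparams,
                           llama_context_params & cparams) {
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

        return llama_params_fit(path_model, &mparams, &cparams,
                                tensor_split.data(), overrides.data(),
                                /*margin=*/ 512ull*1024*1024,
                                /*n_ctx_min=*/ 4096,
                                GGML_LOG_LEVEL_INFO);
    }
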
@@ -537,7 +517,6 @@ extern "C" {
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -621,8 +600,6 @@ extern "C" {
// //
// Load a LoRA adapter from file // Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model, struct llama_model * model,
const char * path_lora); const char * path_lora);
@@ -647,8 +624,7 @@ extern "C" {
// Manually free a LoRA adapter // Manually free a LoRA adapter
// NOTE: loaded adapters will be freed when the associated model is deleted // NOTE: loaded adapters will be freed when the associated model is deleted
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter), LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
"adapters are now freed together with the associated model");
// Get the invocation tokens if the current lora is an alora // Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
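
Editor's note: a small sketch of the adapter lifecycle against the API as kept by this revert; llama_set_adapter_lora is assumed from the surrounding llama.h and the adapter path is a hypothetical example:

    #include "llama.h"

    // sketch: load an adapter for a model, then enable it on a context
    static void apply_lora(llama_model * model, llama_context * ctx) {
        llama_adapter_lora * lora = llama_adapter_lora_init(model, "my-adapter.gguf");
        if (lora != nullptr) {
            llama_set_adapter_lora(ctx, lora, /*scale=*/ 1.0f);
        }
        // per the NOTE above, loaded adapters are freed when the model is deleted,
        // or earlier via llama_adapter_lora_free()
    }
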
@@ -1007,32 +983,6 @@ extern "C" {
// otherwise: float[n_embd] (1-dimensional) // otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// backend sampling API [EXPERIMENTAL]
// note: use only if the llama_context was created with at least one llama_sampler_seq_config
//
// Get the backend sampled token for the ith token.
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled logits for the ith token
// Returns NULL if no logits were sampled.
LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled candidates (token ids) for the ith token
// These are needed to map probability/logit indices to vocab token ids.
// Returns NULL if no candidates were sampled.
LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
// //
// Vocab // Vocab
// //
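
Editor's note: a sketch of reading back a backend-sampled token and its probabilities for output row i, using the experimental getters removed above; this only compiles against the pre-revert header:

    #include <cstdio>
    #include "llama.h"

    // sketch: dump the backend-sampled token and its candidate probabilities
    static void dump_sampled(llama_context * ctx, int32_t i) {
        const llama_token tok = llama_get_sampled_token_ith(ctx, i);
        if (tok == LLAMA_TOKEN_NULL) {
            return; // no backend sampler ran for this row
        }
        const float       * probs = llama_get_sampled_probs_ith(ctx, i);
        const llama_token * cand  = llama_get_sampled_candidates_ith(ctx, i);
        const uint32_t      n     = llama_get_sampled_probs_count_ith(ctx, i);
        printf("sampled token: %d\n", tok);
        for (uint32_t j = 0; probs != nullptr && cand != nullptr && j < n; ++j) {
            printf("  candidate %d -> p = %.4f\n", cand[j], probs[j]);
        }
    }
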
@@ -1204,16 +1154,11 @@ extern "C" {
// //
// llama_sampler_free(smpl); // llama_sampler_free(smpl);
// //
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
//
typedef void * llama_sampler_context_t; typedef void * llama_sampler_context_t;
struct llama_sampler_data {
struct ggml_tensor * logits;
struct ggml_tensor * probs;
struct ggml_tensor * sampled;
struct ggml_tensor * candidates;
};
// user code can implement the interface below in order to create custom llama_sampler // user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i { struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1223,44 +1168,17 @@ extern "C" {
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
// [EXPERIMENTAL] // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
// backend sampling interface: //void (*apply_ggml) (struct llama_sampler * smpl, ...);
// return true if the backend supports all ops needed by the sampler
// note: call once per sampler
bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
// call after .backend_apply()
void (*backend_accept)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_tensor * selected_token);
// call after .backend_init()
void (*backend_apply)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct llama_sampler_data * data);
// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);
}; };
struct llama_sampler { struct llama_sampler {
struct llama_sampler_i * iface; const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
llama_sampler_context_t ctx;
}; };
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i: // mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx); LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
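
Editor's note: a minimal custom sampler built on the non-backend half of llama_sampler_i / llama_sampler_init, which this revert keeps; it merely selects candidate 0, so treat it as an illustration of the interface rather than a useful sampler (members are assigned by name to avoid depending on field order):

    #include "llama.h"

    static const char * pick_first_name(const llama_sampler * /*smpl*/) { return "pick-first"; }

    static void pick_first_apply(llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
        if (cur_p->size > 0) {
            cur_p->selected = 0; // always take the first candidate
        }
    }

    static llama_sampler * make_pick_first() {
        static llama_sampler_i iface = {};
        iface.name  = pick_first_name;
        iface.apply = pick_first_apply; // accept/reset/clone/free stay NULL, which is allowed when ctx is NULL
        return llama_sampler_init(&iface, /*ctx=*/ nullptr);
    }
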
@@ -1276,15 +1194,7 @@ extern "C" {
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl); LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
// return NULL if:
// - the sampler is NULL
// - the sampler is not a llama_sampler_chain
// - the index is out of bounds, unless i == -1
// - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
// the total number of samplers in the chain
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1293,9 +1203,7 @@ extern "C" {
// available samplers: // available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// seed == LLAMA_DEFAULT_SEED to use a random seed.
LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop /// Setting k <= 0 makes this a noop
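
Editor's note: a short sketch of building a chain and walking it with the accessors documented above; LLAMA_DEFAULT_SEED is the random-seed sentinel mentioned in the removed comment:

    #include <cstdio>
    #include "llama.h"

    // sketch: create a chain, add a dist sampler, list the chain's contents
    static void list_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // chain takes ownership
        for (int i = 0; i < llama_sampler_chain_n(chain); ++i) {
            printf("%d: %s\n", i, llama_sampler_name(llama_sampler_chain_get(chain, i)));
        }
        llama_sampler_free(chain); // also frees the samplers it owns
    }
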
@@ -1396,33 +1304,6 @@ extern "C" {
const char ** seq_breakers, const char ** seq_breakers,
size_t num_breakers); size_t num_breakers);
/// adaptive-p: select tokens near a configurable target probability over time.
///
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
/// that fall near a user-configurable probability target.
///
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
/// adapted target probability at each sampling step, thus maintaining the desired target
/// probability over time.
///
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
/// in the sampler chain (like mirostat, dist, greedy).
///
/// only mild truncation before this sampler is recommended. we suggest applying min-p
/// before adaptive-p as the only other active sampler in the chain.
///
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
/// @param seed RNG seed
///
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
///
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab, int32_t n_vocab,
int32_t n_logit_bias, int32_t n_logit_bias,
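
Editor's note: a sketch of the chain the removed adaptive-p notes recommend, i.e. mild min-p truncation followed by adaptive-p as the final, token-selecting sampler; this only applies to the pre-revert header, and the target/decay values are arbitrary examples within the documented ranges:

    #include "llama.h"

    static llama_sampler * make_adaptive_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_min_p(/*p=*/ 0.05f, /*min_keep=*/ 1));
        llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(/*target=*/ 0.6f, /*decay=*/ 0.9f, /*seed=*/ LLAMA_DEFAULT_SEED));
        return chain; // adaptive-p must stay last in the chain, like mirostat/dist/greedy
    }
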
@@ -1476,12 +1357,12 @@ extern "C" {
/// @details Build a split GGUF final path for this chunk. /// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length. // Returns the split_path length.
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count); LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length. // Returns the split_prefix length.
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count); LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Print system information // Print system information
LLAMA_API const char * llama_print_system_info(void); LLAMA_API const char * llama_print_system_info(void);
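
Editor's note: a small sketch reproducing the split-path examples from the comments above:

    #include <cstdio>
    #include "llama.h"

    static void split_demo() {
        char split_path[128];
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
        // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

        char split_prefix[64];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            printf("%s\n", split_prefix); // "/models/ggml-model-q4_0"
        }
    }
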

View File

@@ -411,9 +411,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
} }
} }
// register adapter with model
model.loras.insert(&adapter);
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
} }
@@ -471,8 +468,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
return snprintf(buf, buf_size, "%s", it->second.c_str()); return snprintf(buf, buf_size, "%s", it->second.c_str());
} }
void llama_adapter_lora_free(llama_adapter_lora *) { void llama_adapter_lora_free(llama_adapter_lora * adapter) {
// deprecated: adapters are freed by llama_model's destructor delete adapter;
} }
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) { uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {

View File

@@ -77,10 +77,6 @@ struct llama_adapter_lora {
~llama_adapter_lora() = default; ~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w); llama_adapter_lora_weight * get_weight(ggml_tensor * w);
uint32_t get_n_nodes() const {
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
}
}; };
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>; using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;

View File

@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" }, { LLM_ARCH_PLAMO2, "plamo2" },
{ LLM_ARCH_PLAMO3, "plamo3" },
{ LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" }, { LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -81,7 +79,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" }, { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
{ LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" }, { LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_EXAONE_MOE, "exaone-moe" },
{ LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_RWKV7, "rwkv7" }, { LLM_ARCH_RWKV7, "rwkv7" },
@@ -118,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@@ -154,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" }, { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
{ LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" },
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -212,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -224,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -509,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3: case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM,
@@ -791,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
}; };
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_JINA_BERT_V2:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -954,8 +930,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V, LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_GATE_EXPS,
@@ -1086,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_POST_NORM,
}; };
case LLM_ARCH_PLAMO3:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_CODESHELL: case LLM_ARCH_CODESHELL:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -1732,38 +1690,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_POST_NORM,
}; };
case LLM_ARCH_EXAONE_MOE:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
@@ -2114,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2, LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT,
LLM_TENSOR_DENSE_2_OUT,
}; };
case LLM_ARCH_LFM2MOE: case LLM_ARCH_LFM2MOE:
return { return {
@@ -2133,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2, LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2249,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_DOWN,
LLM_TENSOR_VISEXP_FFN_UP, LLM_TENSOR_VISEXP_FFN_UP,
}; };
case LLM_ARCH_MIMO2:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
case LLM_ARCH_GPTJ: case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN: case LLM_ARCH_UNKNOWN:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
}; };
case LLM_ARCH_MAINCODER:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_SOLAR: case LLM_ARCH_SOLAR:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,

View File

@@ -24,7 +24,6 @@ enum llm_arch {
LLM_ARCH_STARCODER, LLM_ARCH_STARCODER,
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BERT, LLM_ARCH_BERT,
LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT, LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
LLM_ARCH_PHIMOE, LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO, LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2, LLM_ARCH_PLAMO2,
LLM_ARCH_PLAMO3,
LLM_ARCH_CODESHELL, LLM_ARCH_CODESHELL,
LLM_ARCH_ORION, LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2, LLM_ARCH_INTERNLM2,
@@ -85,7 +83,6 @@ enum llm_arch {
LLM_ARCH_NEMOTRON_H_MOE, LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_EXAONE, LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4, LLM_ARCH_EXAONE4,
LLM_ARCH_EXAONE_MOE,
LLM_ARCH_RWKV6, LLM_ARCH_RWKV6,
LLM_ARCH_RWKV6QWEN2, LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_RWKV7, LLM_ARCH_RWKV7,
@@ -122,9 +119,6 @@ enum llm_arch {
LLM_ARCH_RND1, LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED, LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3, LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@@ -158,7 +152,6 @@ enum llm_kv {
LLM_KV_VOCAB_SIZE, LLM_KV_VOCAB_SIZE,
LLM_KV_CONTEXT_LENGTH, LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH, LLM_KV_EMBEDDING_LENGTH,
LLM_KV_EMBEDDING_LENGTH_OUT,
LLM_KV_FEATURES_LENGTH, LLM_KV_FEATURES_LENGTH,
LLM_KV_BLOCK_COUNT, LLM_KV_BLOCK_COUNT,
LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -216,7 +209,6 @@ enum llm_kv {
LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -228,7 +220,6 @@ enum llm_kv {
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR, LLM_KV_ROPE_SCALING_FACTOR,

View File

@@ -57,7 +57,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 }, { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
{ "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE }, { "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
@@ -75,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED }, { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -138,9 +136,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("[gMASK]<sop>")) { } else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4; return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
if (tmpl_contains("<|tool_declare|>")) {
return LLM_CHAT_TEMPLATE_EXAONE_MOE;
}
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE; return LLM_CHAT_TEMPLATE_GLMEDGE;
@@ -221,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2; return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) { } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED; return LLM_CHAT_TEMPLATE_PANGU_EMBED;
} else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
} }
return LLM_CHAT_TEMPLATE_UNKNOWN; return LLM_CHAT_TEMPLATE_UNKNOWN;
} }
@@ -580,22 +573,6 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "[|assistant|]"; ss << "[|assistant|]";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "user") {
ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "assistant") {
ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
} else if (role == "tool") {
ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
}
}
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token // this template requires the model to have "\n\n" as EOT token
for (size_t i = 0; i < chat.size(); i++) { for (size_t i = 0; i < chat.size(); i++) {
@@ -868,14 +845,6 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "[unused9]助手:"; ss << "[unused9]助手:";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|begin|>assistant";
}
} else { } else {
// template not supported // template not supported
return -1; return -1;
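
Editor's note: a sketch of what the removed "solar-open" branch would render for a short chat, going through the public llama_chat_apply_template entry point (assumed from llama.h); only meaningful against a pre-revert build that still registers this template:

    #include <cstdio>
    #include "llama.h"

    static void solar_open_demo() {
        const llama_chat_message chat[] = {
            { "user",      "hi"     },
            { "assistant", "hello!" },
        };
        char buf[256];
        const int32_t n = llama_chat_apply_template("solar-open", chat, 2, /*add_ass=*/ true, buf, sizeof(buf));
        if (n > 0 && n < (int32_t) sizeof(buf)) {
            // expected: <|begin|>user<|content|>hi<|end|><|begin|>assistant<|content|>hello!<|end|><|begin|>assistant
            printf("%.*s\n", n, buf);
        }
    }
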

View File

@@ -36,7 +36,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3, LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_EXAONE_4, LLM_CHAT_TEMPLATE_EXAONE_4,
LLM_CHAT_TEMPLATE_EXAONE_MOE,
LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE, LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT, LLM_CHAT_TEMPLATE_GIGACHAT,
@@ -55,7 +54,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2, LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED, LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };

File diff suppressed because it is too large

View File

@@ -40,14 +40,6 @@ struct llama_context {
~llama_context(); ~llama_context();
// reserve a new backend scheduler (if needed)
// for example, when:
// - changing loras
// - changing samplers
// - changing attention type
// - etc.
void sched_reserve();
void synchronize(); void synchronize();
const llama_model & get_model() const; const llama_model & get_model() const;
@@ -78,18 +70,6 @@ struct llama_context {
float * get_embeddings_ith(int32_t i); float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id); float * get_embeddings_seq(llama_seq_id seq_id);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
float * get_sampled_logits_ith(int32_t idx);
size_t get_sampled_logits_count(int32_t idx);
float * get_sampled_probs_ith(int32_t idx);
size_t get_sampled_probs_count(int32_t idx);
const llama_token * get_sampled_candidates_ith(int32_t idx);
size_t get_sampled_candidates_count(int32_t idx);
void attach_threadpool( void attach_threadpool(
ggml_threadpool_t threadpool, ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch); ggml_threadpool_t threadpool_batch);
@@ -212,13 +192,10 @@ private:
// Make sure enough space is available for outputs. // Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved. // Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch); uint32_t output_reserve(int32_t n_outputs);
void output_reorder(); void output_reorder();
// map the output row index `i` to batch index
int64_t output_resolve_row(int32_t i) const;
// //
// graph // graph
// //
@@ -236,8 +213,6 @@ public:
ggml_cgraph * graph_reserve( ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
private: private:
llm_graph_params graph_params( llm_graph_params graph_params(
llm_graph_result * res, llm_graph_result * res,
@@ -277,31 +252,6 @@ private:
size_t embd_size = 0; // capacity (of floats) for embeddings size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr; float * embd = nullptr;
// TODO: simplify
struct sampling_info {
std::map<llama_seq_id, llama_sampler *> samplers;
float * logits = nullptr;
size_t logits_size = 0;
llama_token * sampled = nullptr;
size_t sampled_size = 0;
float * probs = nullptr;
size_t probs_size = 0;
llama_token * candidates = nullptr;
size_t candidates_size = 0;
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
std::vector<llama_token> token_ids_full_vocab;
};
sampling_info sampling;
// sequence embeddings output (map of [n_embd] vectors) // sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq; std::map<llama_seq_id, std::vector<float>> embd_seq;
@@ -322,8 +272,6 @@ private:
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
bool sched_need_reserve = true;
ggml_backend_t backend_cpu = nullptr; ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends; std::vector<ggml_backend_ptr> backends;

View File

@@ -30,12 +30,10 @@ struct llama_cparams {
bool causal_attn; bool causal_attn;
bool offload_kqv; bool offload_kqv;
bool flash_attn; bool flash_attn;
bool auto_fa;
bool no_perf; bool no_perf;
bool warmup; bool warmup;
bool op_offload; bool op_offload;
bool kv_unified; bool kv_unified;
bool pipeline_parallel;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;

View File

@@ -369,44 +369,6 @@ static void print_rule(
fprintf(file, "\n"); fprintf(file, "\n");
} }
//
// Regex utilities
//
size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
auto find_start_pos = [](const std::smatch & match) {
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
return start;
};
if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
// match against the entire input
std::smatch match;
if (std::regex_match(input, match, regex)) {
return find_start_pos(match);
}
}
// search anywhere
std::smatch match;
if (std::regex_search(input, match, regex)) {
return find_start_pos(match);
}
return std::string::npos;
}
// //
// implementation // implementation
// //
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position)); grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece; grammar.trigger_buffer += piece;
std::smatch match;
for (const auto & trigger_pattern : grammar.trigger_patterns) { for (const auto & trigger_pattern : grammar.trigger_patterns) {
auto start = trigger_pattern.find(grammar.trigger_buffer); if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
if (start != std::string::npos) {
grammar.awaiting_trigger = false; grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
// replay tokens that overlap with [start, end) // replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) { for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {

View File

@@ -130,8 +130,6 @@ struct llama_grammar_parser {
struct llama_grammar_trigger_pattern { struct llama_grammar_trigger_pattern {
std::string pattern; std::string pattern;
std::regex regex; std::regex regex;
size_t find(const std::string & input) const;
}; };
struct llama_grammar { struct llama_grammar {

View File

@@ -7,13 +7,11 @@
#include "llama-kv-cache.h" #include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h" #include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h" #include "llama-memory-hybrid.h"
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h" #include "llama-memory-recurrent.h"
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <unordered_set>
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) { if (ubatch->token) {
@@ -23,8 +21,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
} }
if (ubatch->embd) { if (ubatch->embd) {
GGML_ASSERT(n_embd == embd->ne[0]); const int64_t n_embd = embd->ne[0];
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -34,8 +31,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true; bool res = true;
res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res; return res;
} }
@@ -65,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
bool res = true; bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd; res &= pos->ne[0] == params.ubatch.n_tokens;
return res; return res;
} }
@@ -98,9 +95,11 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) pos_bucket->data; int32_t * data = (int32_t *) pos_bucket->data;
for (int j = 0; j < n_tokens; ++j) { for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) { for (int j = 0; j < n_tokens; ++j) {
data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true); for (int i = 0; i < n_tokens; ++i) {
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
}
} }
} }
} }
@@ -323,32 +322,34 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) { const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
for (int i1 = 0; i1 < n_tokens; ++i1) { for (int h = 0; h < 1; ++h) {
const llama_seq_id s1 = ubatch->seq_id[i1][0]; for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_pos p1 = ubatch->pos[i1]; const llama_seq_id s1 = ubatch->seq_id[i1][0];
const llama_pos p1 = ubatch->pos[i1];
const uint64_t idst = i1*n_kv; const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
for (int i0 = 0; i0 < n_tokens; ++i0) { for (int i0 = 0; i0 < n_tokens; ++i0) {
const llama_seq_id s0 = ubatch->seq_id[i0][0]; const llama_seq_id s0 = ubatch->seq_id[i0][0];
const llama_pos p0 = ubatch->pos[i0]; const llama_pos p0 = ubatch->pos[i0];
// mask different sequences // mask different sequences
if (s0 != s1) { if (s0 != s1) {
continue; continue;
}
// mask future tokens
if (cparams.causal_attn && p0 > p1) {
continue;
}
// apply SWA if any
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
continue;
}
data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
} }
// mask future tokens
if (cparams.causal_attn && p0 > p1) {
continue;
}
// apply SWA if any
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
continue;
}
data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
} }
} }
}; };
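
Editor's note: a standalone restatement of the rule fill_mask applies above: key position p0 is visible from query position p1 only when both belong to the same sequence, p0 is not in the future (for causal attention), and p0 is not masked by the sliding window; llama_hparams::is_masked_swa and the llama_swa_type enum are assumed from llama-hparams.h as used above:

    #include "llama-hparams.h" // assumed internal header providing llama_hparams / llama_swa_type

    static bool kq_visible(llama_seq_id s0, llama_seq_id s1,
                           llama_pos p0, llama_pos p1,
                           bool causal_attn, int n_swa, llama_swa_type swa_type) {
        if (s0 != s1) {
            return false; // different sequences never attend to each other
        }
        if (causal_attn && p0 > p1) {
            return false; // future tokens are masked under causal attention
        }
        return !llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
    }
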
@@ -407,27 +408,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
return res; return res;
} }
void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
mctx->set_input_k_idxs(self_k_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
return res;
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -473,19 +453,27 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
float * data = (float *) cross_kq_mask->data; float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) { for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_enc; ++j) { for (int i = 0; i < n_tokens; ++i) {
float f = -INFINITY; for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s]; const llama_seq_id seq_id = ubatch->seq_id[i][s];
if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) { if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
f = 0.0f; f = 0.0f;
}
} }
}
data[i*n_enc + j] = f; data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
}
}
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
} }
} }
} }
@@ -533,113 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
return res; return res;
} }
void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
const auto * attn_ctx = mctx->get_attn();
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}
bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
const auto * attn_ctx = mctx->get_attn();
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
}
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
return res;
}
void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
// set the inputs only for the active samplers in the current ubatch
std::unordered_set<llama_seq_id> active_samplers;
for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
if (ubatch->output[i]) {
llama_seq_id seq_id = ubatch->seq_id[i][0];
active_samplers.insert(seq_id);
}
}
for (auto seq_id : active_samplers) {
if (samplers.find(seq_id) == samplers.end()) {
continue;
}
auto & sampler = samplers[seq_id];
if (sampler->iface->backend_set_input) {
sampler->iface->backend_set_input(sampler);
}
}
}
bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
if (samplers.size() != params.samplers.size()) {
return false;
}
for (const auto & [seq_id, sampler] : params.samplers) {
if (samplers[seq_id] != sampler) {
return false;
}
}
return true;
}
// //
// llm_graph_result // llm_graph_result
// //
@@ -656,15 +537,10 @@ int64_t llm_graph_result::get_max_nodes() const {
} }
void llm_graph_result::reset() { void llm_graph_result::reset() {
t_inp_tokens = nullptr; t_tokens = nullptr;
t_inp_embd = nullptr;
t_logits = nullptr; t_logits = nullptr;
t_embd = nullptr; t_embd = nullptr;
t_embd_pooled = nullptr; t_embd_pooled = nullptr;
t_sampled.clear();
t_sampled_probs.clear();
t_sampled_logits.clear();
t_candidates.clear();
params = {}; params = {};
@@ -689,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
} }
} }
void llm_graph_result::set_outputs() {
if (t_logits != nullptr) {
ggml_set_output(t_logits);
}
if (t_embd != nullptr) {
ggml_set_output(t_embd);
}
if (t_embd_pooled != nullptr) {
ggml_set_output(t_embd_pooled);
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_probs) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_logits) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_candidates) {
if (t != nullptr) {
ggml_set_output(t);
}
}
}
bool llm_graph_result::can_reuse(const llm_graph_params & params) { bool llm_graph_result::can_reuse(const llm_graph_params & params) {
if (!this->params.allow_reuse(params)) { if (!this->params.allow_reuse(params)) {
if (debug > 1) { if (debug > 1) {
@@ -802,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
loras (params.loras), loras (params.loras),
mctx (params.mctx), mctx (params.mctx),
cross (params.cross), cross (params.cross),
samplers (params.samplers),
cb_func (params.cb), cb_func (params.cb),
res (params.res), res (params.res),
ctx0 (res->get_ctx()), ctx0 (res->get_ctx()),
@@ -1361,29 +1204,17 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// input embeddings with optional lora // input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd_inp = hparams.n_embd_inp(); const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_embd = hparams.n_embd;
assert(n_embd_inp >= n_embd); auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp); ggml_tensor * cur = nullptr;
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); if (ubatch.token) {
cb(inp->tokens, "inp_tokens", -1); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens); //cb(inp->tokens, "inp_tokens", -1);
res->t_inp_tokens = inp->tokens; ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
cb(inp->embd, "inp_embd", -1);
ggml_set_input(inp->embd);
// select one of the 2 inputs, based on the batch contents
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
std::array<ggml_tensor *, 2> inps;
// token embeddings path (ubatch.token != nullptr)
{
auto & cur = inps[0];
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
@@ -1404,43 +1235,22 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
cur = ggml_add(ctx0, cur, inpL_delta); cur = ggml_add(ctx0, cur, inpL_delta);
} }
} else {
if (n_embd_inp != n_embd) { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0); ggml_set_input(inp->embd);
}
}
// vector embeddings path (ubatch.embd != nullptr)
{
auto & cur = inps[1];
cur = inp->embd; cur = inp->embd;
} }
assert(ggml_are_same_shape (inps[0], inps[1]));
assert(ggml_are_same_stride(inps[0], inps[1]));
ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
if (n_embd_inp != n_embd) {
cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
}
res->t_inp_embd = cur;
// For Granite architecture // For Granite architecture
if (hparams.f_embedding_scale != 0.0f) { if (hparams.f_embedding_scale != 0.0f) {
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
} }
cb(cur, "embd", -1); cb(cur, "inp_embd", -1);
res->add_input(std::move(inp)); res->add_input(std::move(inp));
// make sure the produced embeddings are immediately materialized in the ggml graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18599
ggml_build_forward_expand(gf, cur);
return cur; return cur;
} }
@@ -1532,7 +1342,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
//} //}
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp(); const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
ggml_set_input(cur); ggml_set_input(cur);
@@ -1630,11 +1440,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, LLAMA_TENSOR_NAME_FATTN, il); cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
if (!cparams.offload_kqv) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
}
ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_add_sinks(cur, sinks);
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1844,11 +1649,9 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, ggml_tensor * sinks,
ggml_tensor * v_mla, // TODO: remove ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
GGML_ASSERT(v_mla == nullptr);
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced // by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache // expand k later to enable rope fusion which directly writes into k-v cache
@@ -1891,93 +1694,6 @@ ggml_tensor * llm_graph_context::build_attn(
return cur; return cur;
} }
static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
const auto n_kv = mctx_cur->get_n_kv();
const auto n_tokens = ubatch.n_tokens;
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
return inp;
}
llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);
const auto * mctx_cur = inp->mctx;
// store to KV cache
{
const auto & k_idxs = inp->get_k_idxs();
ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
}
const auto & kq_mask = inp->get_kq_mask();
ggml_tensor * q = q_cur;
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
if (wo_b) {
cur = ggml_add(ctx0, cur, wo_b);
}
return cur;
}
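The removed V-less attention path above never allocates a separate V cache: V is taken as a zero-offset view over the leading v_cur->ne[0] elements of each cached K row. A tiny standalone sketch of that prefix-view idea, assuming plain row-major storage (illustrative only, not the actual ggml layout):

    #include <cstdio>
    #include <vector>

    int main() {
        // each cached K row holds n_embd_k values; "V" reuses the first n_embd_v of them
        const int n_kv = 3, n_embd_k = 4, n_embd_v = 2;
        std::vector<float> k(n_kv * n_embd_k);
        for (int i = 0; i < n_kv * n_embd_k; ++i) k[i] = (float) i;

        for (int row = 0; row < n_kv; ++row) {
            const float * v_row = k.data() + row * n_embd_k;  // a view, no copy
            std::printf("v[%d] = %.0f .. %.0f\n", row, v_row[0], v_row[n_embd_v - 1]);
        }
        return 0;
    }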
ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp, llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo, ggml_tensor * wo,
@@ -2118,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask); ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
} }
{ {
@@ -2134,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa); ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
} }
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp)); return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2273,62 +1985,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
} }
llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
// build iswa attention input
const auto * attn_ctx = mctx_cur->get_attn();
auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
{
const auto n_kv = attn_ctx->get_base()->get_n_kv();
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp_attn->self_kq_mask);
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
}
{
const auto n_kv = attn_ctx->get_swa()->get_n_kv();
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp_attn->self_kq_mask_swa);
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
}
auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
}
void llm_graph_context::build_dense_out( void llm_graph_context::build_dense_out(
ggml_tensor * dense_2, ggml_tensor * dense_2,
ggml_tensor * dense_3) const { ggml_tensor * dense_3) const {
if (!cparams.embeddings || !(dense_2 || dense_3)) { if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
return; return;
} }
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd; ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd"); GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
if (dense_2) { cur = ggml_mul_mat(ctx0, dense_2, cur);
cur = ggml_mul_mat(ctx0, dense_2, cur); cur = ggml_mul_mat(ctx0, dense_3, cur);
}
if (dense_3) {
cur = ggml_mul_mat(ctx0, dense_3, cur);
}
cb(cur, "result_embd_pooled", -1); cb(cur, "result_embd_pooled", -1);
res->t_embd_pooled = cur; res->t_embd_pooled = cur;
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
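build_dense_out applies the two learned output projections to the pooled embedding; the removed variant applied whichever of dense_2/dense_3 was present, while the reinstated code skips the projection entirely unless both are set. A standalone sketch of chaining optional projections over a pooled vector, with made-up shapes and names:

    #include <cstdio>
    #include <vector>

    using Mat = std::vector<std::vector<float>>;  // row-major [rows][cols]

    // y = M * x, or x unchanged when the projection is absent
    static std::vector<float> project(const Mat * M, const std::vector<float> & x) {
        if (M == nullptr) return x;
        std::vector<float> y(M->size(), 0.0f);
        for (size_t r = 0; r < M->size(); ++r)
            for (size_t c = 0; c < x.size(); ++c)
                y[r] += (*M)[r][c] * x[c];
        return y;
    }

    int main() {
        const std::vector<float> pooled = {1.0f, 2.0f};        // pooled embedding
        const Mat dense_2 = {{1.f, 0.f}, {0.f, 1.f}, {1.f, 1.f}};  // projects 2 -> 3
        const Mat dense_3 = {{1.f, 1.f, 1.f}};                     // projects 3 -> 1

        const auto out = project(&dense_3, project(&dense_2, pooled));
        std::printf("out[0] = %g\n", out[0]);                  // 1 + 2 + 3 = 6
        return 0;
    }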
@@ -2419,87 +2086,6 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
void llm_graph_context::build_sampling() const {
if (samplers.empty() || !res->t_logits) {
return;
}
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
res->add_input(std::move(inp_sampling));
std::map<llama_seq_id, int32_t> seq_to_logit_row;
int32_t logit_row_idx = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
if (ubatch.output[i]) {
llama_seq_id seq_id = ubatch.seq_id[i][0];
seq_to_logit_row[seq_id] = logit_row_idx;
logit_row_idx++;
}
}
// res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
// add a dummy row of logits
// this trick makes the graph static, regardless of which samplers are activated
// this is important in order to minimize graph reallocations
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
for (const auto & [seq_id, sampler] : samplers) {
const auto it = seq_to_logit_row.find(seq_id);
// inactive samplers always work on the first row
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
struct llama_sampler_data data = {
/*.logits =*/ logits_seq,
/*.probs =*/ nullptr,
/*.sampled =*/ nullptr,
/*.candidates =*/ nullptr,
};
assert(sampler->iface->backend_apply);
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
if (data.sampled != nullptr) {
res->t_sampled[seq_id] = data.sampled;
ggml_build_forward_expand(gf, data.sampled);
}
if (data.probs != nullptr) {
res->t_sampled_probs[seq_id] = data.probs;
ggml_build_forward_expand(gf, data.probs);
}
if (data.logits != nullptr) {
res->t_sampled_logits[seq_id] = data.logits;
ggml_build_forward_expand(gf, data.logits);
}
if (data.candidates != nullptr) {
res->t_candidates[seq_id] = data.candidates;
ggml_build_forward_expand(gf, data.candidates);
}
}
// TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
/*
for (const auto & [seq_id, sampler] : samplers) {
if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
ggml_tensor * selected_token = it->second;
if (selected_token != nullptr) {
llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
}
}
}
*/
}
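The removed build_sampling above keeps the sampling graph static across decodes: the logits get one extra padded row, every sequence that produced an output is mapped to its logit row, and samplers whose sequence has no output row fall back to a fixed row so the set of views never changes shape. A small standalone sketch of that seq-id-to-row mapping with a fallback row (plain C++, not the ggml plumbing):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // per-token output flags and sequence ids for a tiny ubatch
        const std::vector<int>     output = {1, 0, 1};
        const std::vector<int32_t> seq_id = {0, 1, 2};

        // map each sequence that produced an output to its logit row
        std::map<int32_t, int32_t> seq_to_logit_row;
        int32_t logit_row_idx = 0;
        for (size_t i = 0; i < output.size(); ++i) {
            if (output[i]) {
                seq_to_logit_row[seq_id[i]] = logit_row_idx++;
            }
        }

        // inactive samplers read a fixed row, so the graph layout is independent
        // of which sequences happen to be active in this batch
        for (int32_t s = 0; s < 3; ++s) {
            const auto it = seq_to_logit_row.find(s);
            const int32_t row = it != seq_to_logit_row.end() ? it->second : 0;
            std::printf("seq %d -> logit row %d\n", s, row);
        }
        return 0;
    }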
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value // TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128; const int64_t max_distance = 128;

View File

@@ -10,7 +10,6 @@
#include <memory> #include <memory>
#include <set> #include <set>
#include <functional> #include <functional>
#include <map>
struct ggml_cgraph; struct ggml_cgraph;
struct ggml_context; struct ggml_context;
@@ -24,7 +23,6 @@ class llama_kv_cache_context;
class llama_kv_cache_iswa_context; class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context; class llama_memory_recurrent_context;
class llama_memory_hybrid_context; class llama_memory_hybrid_context;
class llama_memory_hybrid_iswa_context;
// certain models (typically multi-modal) can produce different types of graphs // certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type { enum llm_graph_type {
@@ -106,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i { class llm_graph_input_embd : public llm_graph_input_i {
public: public:
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} llm_graph_input_embd() = default;
virtual ~llm_graph_input_embd() = default; virtual ~llm_graph_input_embd() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@@ -115,8 +113,6 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
const int64_t n_embd = 0;
}; };
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -317,39 +313,6 @@ public:
const llama_kv_cache_context * mctx; const llama_kv_cache_context * mctx;
}; };
// V-less input for the KV cache
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
class llm_graph_input_attn_k : public llm_graph_input_i {
public:
llm_graph_input_attn_k(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
~llm_graph_input_attn_k() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
const llama_hparams hparams;
const llama_cparams cparams;
const llama_kv_cache_context * mctx;
};
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public: public:
llm_graph_input_attn_kv_iswa( llm_graph_input_attn_kv_iswa(
@@ -433,46 +396,6 @@ public:
const llama_memory_hybrid_context * mctx; const llama_memory_hybrid_context * mctx;
}; };
class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid_iswa(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_iswa_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid_iswa() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_cparams cparams;
const llama_memory_hybrid_iswa_context * mctx;
};
class llm_graph_input_sampling : public llm_graph_input_i {
public:
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
samplers(std::move(samplers)) { }
virtual ~llm_graph_input_sampling() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::map<llama_seq_id, llama_sampler *> samplers;
};
// //
// llm_graph_result // llm_graph_result
// //
@@ -506,23 +429,6 @@ struct llm_graph_params {
const llama_memory_context_i * mctx; const llama_memory_context_i * mctx;
const llama_cross * cross; const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
static bool samplers_equal(
const std::map<llama_seq_id, llama_sampler *> & lhs,
const std::map<llama_seq_id, llama_sampler *> & rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto & [seq_id, sampler] : lhs) {
auto it = rhs.find(seq_id);
if (it == rhs.end() || it->second != sampler) {
return false;
}
}
return true;
}
uint32_t n_outputs; uint32_t n_outputs;
llm_graph_cb cb; llm_graph_cb cb;
@@ -562,36 +468,15 @@ struct llm_graph_params {
return false; return false;
} }
if (n_outputs != other.n_outputs) {
return false;
}
if (!samplers_equal(samplers, other.samplers)) {
return false;
}
if (samplers.size() > 0) {
if (!ubatch.data || !other.ubatch.data) {
return false;
}
// check that the outputs are the same for all samplers
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (ubatch.output[i] != other.ubatch.output[i] ||
ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
return false;
}
}
}
return return
cparams.embeddings == other.cparams.embeddings && cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn && cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch && arch == other.arch &&
gtype == other.gtype && gtype == other.gtype &&
cvec == other.cvec && cvec == other.cvec &&
loras == other.loras && loras == other.loras &&
cross == other.cross; cross == other.cross &&
n_outputs == other.n_outputs;
} }
}; };
@@ -601,7 +486,7 @@ public:
virtual ~llm_graph_result() = default; virtual ~llm_graph_result() = default;
ggml_tensor * get_inp_tokens() const { return t_inp_tokens; } ggml_tensor * get_tokens() const { return t_tokens; }
ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -614,7 +499,6 @@ public:
void reset(); void reset();
void set_inputs(const llama_ubatch * ubatch); void set_inputs(const llama_ubatch * ubatch);
void set_outputs();
// try to update the existing graph result using the new graph parameters in order to reuse it // try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -628,17 +512,11 @@ public:
void set_params(const llm_graph_params & params); void set_params(const llm_graph_params & params);
// important graph nodes // important graph nodes
ggml_tensor * t_inp_tokens = nullptr; ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
ggml_tensor * t_logits = nullptr; ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_embd_pooled = nullptr;
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;
std::map<llama_seq_id, ggml_tensor*> t_sampled;
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
std::vector<llm_graph_input_ptr> inputs; std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute; ggml_context_ptr ctx_compute;
@@ -714,8 +592,6 @@ struct llm_graph_context {
const llama_memory_context_i * mctx; const llama_memory_context_i * mctx;
const llama_cross * cross; const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
const llm_graph_cb & cb_func; const llm_graph_cb & cb_func;
llm_graph_result * res; llm_graph_result * res;
@@ -866,21 +742,6 @@ struct llm_graph_context {
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q] ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
float kq_scale,
int il) const;
llm_graph_input_attn_k * build_attn_inp_k() const;
ggml_tensor * build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@@ -961,8 +822,6 @@ struct llm_graph_context {
llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
// //
// pooling // pooling
// //
@@ -973,12 +832,6 @@ struct llm_graph_context {
ggml_tensor * cls_out, ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const; ggml_tensor * cls_out_b) const;
//
// sampling (backend sampling)
//
void build_sampling() const;
// //
// dense (out) // dense (out)
// //

View File

@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp; return n_embd_inp;
} }
uint32_t llama_hparams::n_embd_out() const {
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il); const uint32_t n_head_kv = this->n_head_kv(il);
@@ -183,21 +179,6 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
bool llama_hparams::is_mla() const {
assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
(n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
}
bool llama_hparams::has_kv(uint32_t il) const { bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) { if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) { if (il < (uint32_t) n_layer_kv_from_start) {
@@ -223,6 +204,42 @@ uint32_t llama_hparams::n_layer_kv() const {
return res; return res;
} }
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}
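As a sanity check on the three SWA variants above, here is a hedged standalone rendering of the same predicate exercised with a few concrete positions (n_swa = 4); it is kept independent of the llama headers and the enum names are illustrative:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    enum swa_type { SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

    // mirrors the masking rules of llama_hparams::is_masked_swa
    static bool masked(uint32_t n_swa, swa_type type, int32_t p0, int32_t p1) {
        switch (type) {
            case SWA_STANDARD:  return p1 - p0 >= (int32_t) n_swa;
            case SWA_CHUNKED:   return p0 < (p1 / (int32_t) n_swa) * (int32_t) n_swa;
            case SWA_SYMMETRIC: return std::abs(p1 - p0) > (int32_t) n_swa / 2;
        }
        return false;
    }

    int main() {
        // standard window of 4: position 3 still sees position 0, position 4 does not
        assert(!masked(4, SWA_STANDARD, 0, 3));
        assert( masked(4, SWA_STANDARD, 0, 4));
        // chunked: a token at position 5 belongs to chunk [4, 8), so positions < 4 are masked
        assert( masked(4, SWA_CHUNKED, 3, 5));
        assert(!masked(4, SWA_CHUNKED, 4, 5));
        // symmetric: half-window of 2 in both directions
        assert(!masked(4, SWA_SYMMETRIC, 6, 8));
        assert( masked(4, SWA_SYMMETRIC, 5, 8));
        return 0;
    }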
bool llama_hparams::use_mrope() const { bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0; return rope_sections[0] > 0 && rope_sections[1] > 0;
} }

View File

@@ -3,7 +3,6 @@
#include "llama.h" #include "llama.h"
#include <array> #include <array>
#include <cassert>
// bump if necessary // bump if necessary
#define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_LAYERS 512
@@ -53,8 +52,8 @@ struct llama_hparams {
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0; uint32_t n_embd_head_k_mla = 0;
uint32_t n_embd_head_v_mla_impl = 0; uint32_t n_embd_head_v_mla = 0;
// for WavTokenizer // for WavTokenizer
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
@@ -108,9 +107,9 @@ struct llama_hparams {
float rope_attn_factor = 1.0f; float rope_attn_factor = 1.0f;
float rope_freq_base_train; float rope_freq_base_train;
float rope_freq_base_train_swa = 10000.0f; float rope_freq_base_train_swa;
float rope_freq_scale_train; float rope_freq_scale_train;
float rope_freq_scale_train_swa = 1.0f; float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn; uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f; float rope_yarn_log_mul = 0.0f;
@@ -126,11 +125,10 @@ struct llama_hparams {
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA) // the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0; uint32_t n_swa = 0;
// if swa_layers[il] == 1, then layer il is SWA // if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA) // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense // by default, all layers are dense
// note: using uint32_t type for compatibility reason std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models // for State Space Models
uint32_t ssm_d_conv = 0; uint32_t ssm_d_conv = 0;
@@ -165,9 +163,6 @@ struct llama_hparams {
// for Classifiers // for Classifiers
uint32_t n_cls_out = 1; uint32_t n_cls_out = 1;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out_impl = 0;
// llama4 smallthinker // llama4 smallthinker
uint32_t n_moe_layer_step = 0; uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4; uint32_t n_no_rope_layer_step = 4;
@@ -240,9 +235,6 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings // dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const; uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
// dimension of key embeddings across all k-v heads // dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const; uint32_t n_embd_k_gqa(uint32_t il = 0) const;
@@ -274,57 +266,15 @@ struct llama_hparams {
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
// note: currently only support if either all or none of the layers are MLA
bool is_mla() const;
uint32_t n_embd_head_k_mla() const;
uint32_t n_embd_head_v_mla() const;
bool has_kv(uint32_t il) const; bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true // number of layers for which has_kv() returns true
uint32_t n_layer_kv() const; uint32_t n_layer_kv() const;
// note that this function uses different SWA parameters from those in the hparams // note that this function uses different SWA parameters from those in the hparams
// note: inlined on purpose for performance reasons
// TODO: think of a better place for this function // TODO: think of a better place for this function
// TODO: pack the SWA params in a struct? // TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) { static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}
bool use_mrope() const; bool use_mrope() const;
}; };

View File

@@ -97,8 +97,6 @@ llama_kv_cache::llama_kv_cache(
__func__, hparams.n_embd_v_gqa_max()); __func__, hparams.n_embd_v_gqa_max());
} }
const bool is_mla = hparams.is_mla();
for (uint32_t il = 0; il < hparams.n_layer; il++) { for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (!hparams.has_kv(il)) { if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
@@ -132,21 +130,18 @@ llama_kv_cache::llama_kv_cache(
throw std::runtime_error("failed to create ggml context for kv cache"); throw std::runtime_error("failed to create ggml context for kv cache");
} }
const bool has_k = true; ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
const bool has_v = !is_mla; ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr; ggml_format_name(k, "cache_k_l%d", il);
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr; ggml_format_name(v, "cache_v_l%d", il);
has_k && ggml_format_name(k, "cache_k_l%d", il);
has_v && ggml_format_name(v, "cache_v_l%d", il);
std::vector<ggml_tensor *> k_stream; std::vector<ggml_tensor *> k_stream;
std::vector<ggml_tensor *> v_stream; std::vector<ggml_tensor *> v_stream;
for (uint32_t s = 0; s < n_stream; ++s) { for (uint32_t s = 0; s < n_stream; ++s) {
k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr); k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr); v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
} }
map_layer_ids[il] = layers.size(); map_layer_ids[il] = layers.size();
@@ -652,10 +647,7 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co
const auto & layer = layers[il]; const auto & layer = layers[il];
ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]); ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
if (layer.v_stream[ssrc]) {
ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
}
} }
} }
} }
@@ -860,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
const llama_seq_id seq_id_cell = cells.seq_get(idx); const llama_seq_id seq_id_cell = cells.seq_get(idx);
// SWA mask // SWA mask
if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
can_use = true; can_use = true;
} }
} }
@@ -1245,197 +1237,6 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
} }
} }
struct args_set_input_kq_mask {
const llama_hparams & hparams;
const llama_ubatch * ubatch;
const std::vector<llama_kv_cells> & v_cells;
const std::vector<uint32_t> & seq_to_stream;
uint32_t n_swa;
llama_swa_type swa_type;
int64_t n_kv;
int64_t n_stream;
int64_t n_tps;
};
template<bool causal, bool swa, bool is_2d, bool alibi>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
//const auto & hparams = args.hparams;
const auto & ubatch = args.ubatch;
const auto & v_cells = args.v_cells;
const auto & seq_to_stream = args.seq_to_stream;
const uint32_t n_swa = args.n_swa;
const llama_swa_type swa_type = args.swa_type;
const int64_t n_kv = args.n_kv;
const int64_t n_stream = args.n_stream;
const int64_t n_tps = args.n_tps;
// the min position in the batch for each sequence
llama_pos seq_pos_min[LLAMA_MAX_SEQ];
std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
const llama_seq_id seq_id = ubatch->seq_id[i][0];
seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
}
for (uint32_t s = 0; s < n_stream; ++s) {
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
for (uint32_t ii = 0; ii < n_tps; ++ii) {
const uint32_t i = s*n_tps + ii;
const llama_seq_id seq_id = ubatch->seq_id[i][0];
const auto & cells = v_cells.at(seq_to_stream[seq_id]);
llama_pos p0 = -1;
const llama_pos p1 = ubatch->pos[i];
// for M-RoPE
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*i;
// for tokens of the same sequence, the mask is mostly the same, so we can reuse it
// the only cells that could change are the ones that are with similar positions as the
// ones in the batch (i.e. due to causal masking, SWA, etc.)
// keep track of those cells and shortcut the loop to save time
// note: this optimization is not compatible with Alibi position encoding
// ref: https://github.com/ggml-org/llama.cpp/pull/18842
bool prev = false;
auto & idxs = seq_idxs[seq_id];
if (!alibi) {
if (seq_srct.find(seq_id) != seq_srct.end()) {
const uint32_t srct = seq_srct[seq_id];
const uint64_t idst_prev = n_kv*srct;
std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
prev = true;
} else {
idxs.clear();
idxs.reserve(ubatch->n_tokens + n_swa + 32);
seq_srct[seq_id] = i;
}
}
for (uint32_t jj = 0; jj < n_kv; ++jj) {
uint32_t j = jj;
// we have an existing mask for this sequence -> update just seq_idxs
if (!alibi) {
if (prev) {
if (jj >= idxs.size()) {
break;
}
j = idxs[jj];
}
}
if (cells.is_empty(j)) {
goto skip;
}
// mask the token if not the same sequence
if (!cells.seq_has(j, seq_id)) {
goto skip;
}
p0 = cells.pos_get(j);
if (!alibi) {
if (!prev) {
// record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
idxs.push_back(j);
}
}
}
if (causal) {
// mask future tokens
if (p0 > p1) {
goto skip;
}
// M-RoPE causal mask
if (is_2d) {
if (p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
goto skip;
}
}
}
}
// apply SWA if any
if (swa) {
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
goto skip;
}
}
if (alibi) {
data[idst + j] = -std::abs(p0 - p1);
} else {
data[idst + j] = 0.0f;
}
continue;
skip:
data[idst + j] = -INFINITY;
}
}
}
}
template<bool causal, bool swa, bool is_2d>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool alibi = args.hparams.use_alibi;
if (alibi) {
set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
} else {
set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
}
}
template<bool causal, bool swa>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool is_2d = args.ubatch->is_pos_2d();
if (is_2d) {
set_input_kq_mask_impl<causal, swa, true> (args, data);
} else {
set_input_kq_mask_impl<causal, swa, false>(args, data);
}
}
template<bool causal>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
if (swa) {
set_input_kq_mask_impl<causal, true> (args, data);
} else {
set_input_kq_mask_impl<causal, false>(args, data);
}
}
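The removed mask-filling code above folds its runtime flags (causal, SWA, 2-D positions, alibi) into template parameters, so the inner per-cell loop compiles without branches on those flags. A minimal standalone sketch of that peel-one-flag-per-level dispatch pattern, with illustrative names and only two flags:

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    template <bool causal, bool alibi>
    static void fill_mask_impl(std::vector<float> & mask, int n) {
        for (int p1 = 0; p1 < n; ++p1) {          // query position
            for (int p0 = 0; p0 < n; ++p0) {      // cached position
                float v = 0.0f;
                if (causal && p0 > p1) {
                    v = -1e30f;                   // masked-out future cell
                } else if (alibi) {
                    v = -(float) std::abs(p1 - p0);  // distance-based bias
                }
                mask[p1 * n + p0] = v;
            }
        }
    }

    // peel one runtime flag per level, ending in a fully specialized loop
    template <bool causal>
    static void fill_mask_impl(std::vector<float> & mask, int n, bool alibi) {
        if (alibi) { fill_mask_impl<causal, true >(mask, n); }
        else       { fill_mask_impl<causal, false>(mask, n); }
    }

    static void fill_mask(std::vector<float> & mask, int n, bool causal, bool alibi) {
        if (causal) { fill_mask_impl<true >(mask, n, alibi); }
        else        { fill_mask_impl<false>(mask, n, alibi); }
    }

    int main() {
        std::vector<float> mask(4 * 4);
        fill_mask(mask, 4, /*causal=*/true, /*alibi=*/false);
        std::printf("mask[q=0, kv=3] = %g\n", mask[0 * 4 + 3]);  // masked future cell
        return 0;
    }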
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
const uint32_t n_tokens = ubatch->n_tokens; const uint32_t n_tokens = ubatch->n_tokens;
@@ -1450,29 +1251,74 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
// n_tps == n_tokens_per_stream // n_tps == n_tokens_per_stream
const int64_t n_tps = n_tokens/n_stream; const int64_t n_tps = n_tokens/n_stream;
//const int64_t t_start = ggml_time_us(); std::fill(data, data + ggml_nelements(dst), -INFINITY);
const args_set_input_kq_mask args = { // Use only the previous KV cells of the correct sequence for each token of the ubatch.
/*.hparams =*/ hparams, // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
/*.ubatch =*/ ubatch, // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
/*.v_cells =*/ v_cells, // Causal mask:
/*.seq_to_stream =*/ seq_to_stream, // xxx-------
/*.n_swa =*/ n_swa, // xxxx------
/*.swa_type =*/ swa_type, // xxxxx-----
/*.n_kv =*/ n_kv, // Non-causal mask:
/*.n_stream =*/ n_stream, // xxxxx-----
/*.n_tps =*/ n_tps, // xxxxx-----
}; // xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
// TODO: optimize this section
for (uint32_t h = 0; h < 1; ++h) {
for (uint32_t s = 0; s < n_stream; ++s) {
for (uint32_t ii = 0; ii < n_tps; ++ii) {
const uint32_t i = s*n_tps + ii;
if (causal_attn) { const llama_seq_id seq_id = ubatch->seq_id[i][0];
set_input_kq_mask_impl<true> (args, data);
} else { const auto & cells = v_cells[seq_to_stream[seq_id]];
set_input_kq_mask_impl<false>(args, data);
const llama_pos p1 = ubatch->pos[i];
// for M-RoPE
const bool is_2d = ubatch->is_pos_2d();
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
for (uint32_t j = 0; j < n_kv; ++j) {
if (cells.is_empty(j)) {
continue;
}
// mask the token if not the same sequence
if (!cells.seq_has(j, seq_id)) {
continue;
}
const llama_pos p0 = cells.pos_get(j);
// mask future tokens
if (causal_attn && p0 > p1) {
continue;
}
// M-RoPE causal mask
if (causal_attn && is_2d && p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
continue;
}
}
// apply SWA if any
if (is_masked_swa(p0, p1)) {
continue;
}
data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
}
}
}
} }
//const int64_t t_end = ggml_time_us();
//LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
} }
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
@@ -1524,7 +1370,7 @@ size_t llama_kv_cache::size_v_bytes() const {
size_t size_v_bytes = 0; size_t size_v_bytes = 0;
for (const auto & layer : layers) { for (const auto & layer : layers) {
size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0; size_v_bytes += ggml_nbytes(layer.v);
} }
return size_v_bytes; return size_v_bytes;
@@ -1602,10 +1448,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v; //const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto & n_rot = hparams.n_rot;
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
auto inp = std::make_unique<llm_graph_input_k_shift>(this); auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
@@ -1626,10 +1468,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_tensor * k = ggml_tensor * k =
ggml_view_3d(ctx, layer.k, ggml_view_3d(ctx, layer.k,
n_rot, n_head_kv, get_size()*n_stream, n_embd_head_k, n_head_kv, get_size()*n_stream,
ggml_row_size(layer.k->type, n_embd_head_k), ggml_row_size(layer.k->type, n_embd_head_k),
ggml_row_size(layer.k->type, n_embd_k_gqa), ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope)); 0);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
@@ -1641,6 +1483,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
return gf; return gf;
} }
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
}
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags); GGML_UNUSED(flags);
@@ -1806,9 +1652,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm]; auto * v = layer.v_stream[cr.strm];
if (!v) {
continue;
}
// Write value type // Write value type
const int32_t v_type_i = (int32_t) v->type; const int32_t v_type_i = (int32_t) v->type;
@@ -1835,9 +1678,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm]; auto * v = layer.v_stream[cr.strm];
if (!v) {
continue;
}
// Write value type // Write value type
const int32_t v_type_i = (int32_t) v->type; const int32_t v_type_i = (int32_t) v->type;
@@ -2041,9 +1881,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm]; auto * v = layer.v_stream[strm];
if (!v) {
continue;
}
// Read type of value // Read type of value
int32_t v_type_i_ref; int32_t v_type_i_ref;
@@ -2085,9 +1922,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm]; auto * v = layer.v_stream[strm];
if (!v) {
continue;
}
// Read type of value // Read type of value
int32_t v_type_i_ref; int32_t v_type_i_ref;

View File

@@ -257,6 +257,8 @@ private:
size_t size_k_bytes() const; size_t size_k_bytes() const;
size_t size_v_bytes() const; size_t size_v_bytes() const;
bool is_masked_swa(llama_pos p0, llama_pos p1) const;
ggml_tensor * build_rope_shift( ggml_tensor * build_rope_shift(
const llama_cparams & cparams, const llama_cparams & cparams,
ggml_context * ctx, ggml_context * ctx,
@@ -303,7 +305,7 @@ public:
bool do_shift, bool do_shift,
stream_copy_info sc_info); stream_copy_info sc_info);
// used to create a batch processing context from a batch // used to create a batch processing context from a batch
llama_kv_cache_context( llama_kv_cache_context(
llama_kv_cache * kv, llama_kv_cache * kv,
slot_info_vec_t sinfos, slot_info_vec_t sinfos,

View File

@@ -1,275 +0,0 @@
#include "llama-memory-hybrid-iswa.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-context.h"
//
// llama_memory_hybrid_iswa
//
llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool swa_full,
uint32_t kv_size,
uint32_t n_ubatch,
uint32_t n_pad,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache_iswa(
model,
type_k,
type_v,
v_trans,
offload,
swa_full,
unified,
kv_size,
n_seq_max,
n_ubatch,
n_pad,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)),
mem_recr(new llama_memory_recurrent(
model,
type_r,
type_s,
offload,
rs_size,
n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
// follow the recurrent pattern for creating the ubatch splits
std::vector<llama_ubatch> ubatches;
while (true) {
llama_ubatch ubatch;
if (embd_all) {
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
// TODO: non-sequential equal split can be done if using unified KV cache
// for simplicity, we always use sequential equal split for now
ubatch = balloc.split_equal(n_ubatch, true);
}
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
if (balloc.get_n_used() < balloc.get_n_tokens()) {
// failed to find a suitable split
break;
}
// prepare the recurrent batches first
if (!mem_recr->prepare(ubatches)) {
// TODO: will the recurrent cache be in an undefined context at this point?
LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
// prepare the attention cache (iswa version returns both base and swa slot infos)
auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
if (sinfos_base.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
if (sinfos_swa.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
return std::make_unique<llama_memory_hybrid_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while(false);
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
return std::make_unique<llama_memory_hybrid_iswa_context>(this);
}
llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
}
bool llama_memory_hybrid_iswa::get_can_shift() const {
// Shifting is trivially supported for recurrent
return mem_attn->get_can_shift();
}
void llama_memory_hybrid_iswa::clear(bool data) {
mem_attn->clear(data);
mem_recr->clear(data);
}
bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
// Try removing from the recurrent cache first since it may fail. If it does
// fail, the cache will not have been mutated.
if (!mem_recr->seq_rm(seq_id, p0, p1)) {
return false;
}
return mem_attn->seq_rm(seq_id, p0, p1);
}
void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
mem_attn->seq_keep(seq_id);
mem_recr->seq_keep(seq_id);
}
void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
mem_attn->seq_add(seq_id, p0, p1, shift);
mem_recr->seq_add(seq_id, p0, p1, shift);
}
void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
mem_attn->seq_div(seq_id, p0, p1, d);
mem_recr->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the min of the total cache is the max of the two caches' min values
return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
}
llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
// the max of the total cache is the min of the two caches' max values
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
mem_attn->state_write(io, seq_id, flags);
mem_recr->state_write(io, seq_id, flags);
}
void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
mem_attn->state_read(io, seq_id, flags);
mem_recr->state_read(io, seq_id, flags);
}
llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
return mem_attn.get();
}
llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
return mem_recr.get();
}
//
// llama_memory_hybrid_iswa_context
//
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
ctx_attn(mem->get_mem_attn()->init_full()),
ctx_recr(mem->get_mem_recr()->init_full()),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
llama_context * lctx,
bool optimize) :
ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
bool llama_memory_hybrid_iswa_context::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
ctx_attn->next();
ctx_recr->next();
if (++i_next >= ubatches.size()) {
return false;
}
return true;
}
bool llama_memory_hybrid_iswa_context::apply() {
assert(!llama_memory_status_is_fail(status));
bool res = true;
res = res & ctx_attn->apply();
res = res & ctx_recr->apply();
return res;
}
llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
return status;
}
const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
}
const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
}
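The removed hybrid iSWA memory above simply composes an attention (iSWA) cache with a recurrent cache and reports the intersection of their valid position ranges: the combined minimum is the max of the two minima, the combined maximum is the min of the two maxima. A tiny standalone illustration of that combination rule with made-up numbers:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // positions still held for one sequence in each sub-cache (illustrative)
        const int attn_min = 10, attn_max = 80;   // attention (iSWA) cache
        const int recr_min =  0, recr_max = 64;   // recurrent cache

        // only positions present in *both* caches are usable by the hybrid memory
        const int min_pos = std::max(attn_min, recr_min);
        const int max_pos = std::min(attn_max, recr_max);
        std::printf("usable positions: [%d, %d]\n", min_pos, max_pos);
        return 0;
    }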

View File

@@ -1,140 +0,0 @@
#pragma once
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"
#include <memory>
#include <vector>
//
// llama_memory_hybrid_iswa
//
// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
// support models where each layer may be either attention-based (with SWA support) or recurrent
class llama_memory_hybrid_iswa : public llama_memory_i {
public:
llama_memory_hybrid_iswa(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool swa_full,
uint32_t kv_size,
uint32_t n_ubatch,
uint32_t n_pad,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid_iswa() = default;
//
// llama_memory_i
//
llama_memory_context_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_context_ptr init_full() override;
llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_memory_hybrid_iswa specific API
//
llama_kv_cache_iswa * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const;
private:
const llama_hparams & hparams;
const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr;
};
class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// init failure
explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
// init full
explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
// init update
explicit llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
llama_context * lctx,
bool optimize);
// init success
llama_memory_hybrid_iswa_context(
llama_memory_hybrid_iswa * mem,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches);
~llama_memory_hybrid_iswa_context() = default;
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_memory_hybrid_iswa_context
//
const llama_kv_cache_iswa_context * get_attn() const;
const llama_memory_recurrent_context * get_recr() const;
private:
// the index of the next ubatch to process
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
const llama_memory_context_ptr ctx_attn;
const llama_memory_context_ptr ctx_recr;
const llama_memory_status status;
};

View File

@@ -13,10 +13,9 @@
#ifdef __has_include #ifdef __has_include
#if __has_include(<unistd.h>) #if __has_include(<unistd.h>)
#include <unistd.h> #include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES) #if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h> #include <sys/mman.h>
#include <fcntl.h>
#endif #endif
#if defined(_POSIX_MEMLOCK_RANGE) #if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h> #include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
return ret; return ret;
} }
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode); fp = ggml_fopen(fname, mode);
if (fp == NULL) { if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
} }
} }
void read_raw(void * ptr, size_t len) { void read_raw(void * ptr, size_t len) const {
size_t bytes_read = 0; size_t bytes_read = 0;
while (bytes_read < len) { while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024); size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
} }
} }
uint32_t read_u32() { uint32_t read_u32() const {
uint32_t val; uint32_t val;
read_raw(&val, sizeof(val)); read_raw(&val, sizeof(val));
return val; return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
write_raw(&val, sizeof(val)); write_raw(&val, sizeof(val));
} }
bool has_direct_io() const {
return true;
}
~impl() { ~impl() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
} }
} }
#else #else
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) { impl(const char * fname, const char * mode) {
#ifdef __linux__ fp = ggml_fopen(fname, mode);
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
if (init_fd()) {
return;
}
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
init_fp(mode);
}
#ifdef __linux__
bool init_fd() {
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
if (fd != -1) {
struct stat file_stats{};
fstat(fd, &file_stats);
size = file_stats.st_size;
alignment = file_stats.st_blksize;
off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return true;
}
return false;
}
#endif
void init_fp(const char * mode) {
fp = ggml_fopen(fname.c_str(), mode);
if (fp == NULL) { if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno))); throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
} }
seek(0, SEEK_END); seek(0, SEEK_END);
size = tell(); size = tell();
@@ -210,122 +170,46 @@ struct llama_file::impl {
} }
size_t tell() const { size_t tell() const {
if (fd == -1) { // TODO: this ifdef is never true?
long ret = std::ftell(fp); #ifdef _WIN32
if (ret == -1) { __int64 ret = _ftelli64(fp);
throw std::runtime_error(format("ftell error: %s", strerror(errno))); #else
} long ret = std::ftell(fp);
#endif
return (size_t) ret; if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
} }
off_t pos = lseek(fd, 0, SEEK_CUR); return (size_t) ret;
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
} }
void seek(size_t offset, int whence) const { void seek(size_t offset, int whence) const {
off_t ret = 0; // TODO: this ifdef is never true?
if (fd == -1) { #ifdef _WIN32
ret = std::fseek(fp, (long) offset, whence); int ret = _fseeki64(fp, (__int64) offset, whence);
} else { #else
ret = lseek(fd, offset, whence); int ret = std::fseek(fp, (long) offset, whence);
} #endif
if (ret == -1) { if (ret != 0) {
throw std::runtime_error(format("seek error: %s", strerror(errno))); throw std::runtime_error(format("seek error: %s", strerror(errno)));
} }
} }
void read_raw_unsafe(void * ptr, size_t len) { void read_raw(void * ptr, size_t len) const {
if (len == 0) { if (len == 0) {
return; return;
} }
errno = 0; errno = 0;
if (fd == -1) { std::size_t ret = std::fread(ptr, len, 1, fp);
const size_t curr_off = tell(); if (ferror(fp)) {
const size_t to_read = std::min(len, size - curr_off); throw std::runtime_error(format("read error: %s", strerror(errno)));
}
std::size_t ret = std::fread(ptr, to_read, 1, fp); if (ret != 1) {
if (ferror(fp)) { throw std::runtime_error("unexpectedly reached end of file");
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (to_read > 0 && ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
size_t bytes_read = 0;
while (bytes_read < len) {
const size_t to_read = len - bytes_read;
ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
// Fallback to std::fread in case the DMA controller cannot access the buffer
if (errno == EFAULT || errno == EINVAL) {
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
auto curr_off = tell();
close(fd);
fd = -1;
alignment = 1;
init_fp("rb");
seek(curr_off, SEEK_SET);
read_raw_unsafe(ptr, len);
return;
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
// EOF: allow if this read was only pulling alignment padding past file end
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos != -1 && (size_t) pos == size) {
std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
return;
}
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += (size_t) ret;
}
} }
} }
void read_aligned_chunk(void * dest, size_t size) { uint32_t read_u32() const {
size_t offset = tell();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}
struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}
void read_raw(void * ptr, size_t len) {
if (has_direct_io()) {
read_aligned_chunk(ptr, len);
} else {
read_raw_unsafe(ptr, len);
}
}
uint32_t read_u32() {
uint32_t ret; uint32_t ret;
read_raw(&ret, sizeof(ret)); read_raw(&ret, sizeof(ret));
return ret; return ret;
@@ -346,48 +230,27 @@ struct llama_file::impl {
write_raw(&val, sizeof(val)); write_raw(&val, sizeof(val));
} }
bool has_direct_io() const {
return fd != -1 && alignment > 1;
}
~impl() { ~impl() {
if (fd != -1) { if (fp) {
close(fd);
} else {
std::fclose(fp); std::fclose(fp);
} }
} }
int fd = -1;
std::string fname;
#endif #endif
size_t read_alignment() const { FILE * fp;
return alignment; size_t size;
}
size_t alignment = 1;
FILE * fp{};
size_t size{};
}; };
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::~llama_file() = default; llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; } size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const { int llama_file::file_id() const {
#ifdef _WIN32 #ifdef _WIN32
return _fileno(pimpl->fp); return _fileno(pimpl->fp);
#else #else
if (pimpl->fd != -1) {
return pimpl->fd;
}
#if defined(fileno) #if defined(fileno)
return fileno(pimpl->fp); return fileno(pimpl->fp);
#else #else
@@ -397,14 +260,9 @@ int llama_file::file_id() const {
} }
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
#ifdef _WIN32
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#else
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
#endif
uint32_t llama_file::read_u32() { return pimpl->read_u32(); } uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
@@ -618,9 +476,9 @@ struct llama_mlock::impl {
char* errmsg = std::strerror(errno); char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM); bool suggest = (errno == ENOMEM);
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__) #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK // visionOS/tvOS dont't support RLIMIT_MEMLOCK
// Skip resource limit checks on these platforms // Skip resource limit checks on visionOS/tvOS
suggest = false; suggest = false;
#else #else
struct rlimit lock_limit; struct rlimit lock_limit;


@@ -3,7 +3,6 @@
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <cstdio>
struct llama_file; struct llama_file;
struct llama_mmap; struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>; using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
struct llama_file { struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false); llama_file(const char * fname, const char * mode);
~llama_file(); ~llama_file();
size_t tell() const; size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {
void seek(size_t offset, int whence) const; void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len); void read_raw(void * ptr, size_t len) const;
void read_raw_unsafe(void * ptr, size_t len); uint32_t read_u32() const;
void read_aligned_chunk(void * dest, size_t size);
uint32_t read_u32();
void write_raw(const void * ptr, size_t len) const; void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const; void write_u32(uint32_t val) const;
size_t read_alignment() const;
bool has_direct_io() const;
private: private:
struct impl; struct impl;
std::unique_ptr<impl> pimpl; std::unique_ptr<impl> pimpl;


@@ -2,7 +2,6 @@
#include "ggml.h" #include "ggml.h"
#include <algorithm>
#include <array> #include <array>
#include <cinttypes> #include <cinttypes>
#include <cstring> #include <cstring>
@@ -345,7 +344,6 @@ namespace GGUFMeta {
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
switch (arr_info.gt) { switch (arr_info.gt) {
case GGUF_TYPE_BOOL:
case GGUF_TYPE_UINT32: case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break; (std::is_same<T, uint32_t>::value)); break;
@@ -367,13 +365,7 @@ namespace GGUFMeta {
result[i] = value; result[i] = value;
} }
} else { } else {
if (arr_info.gt == GGUF_TYPE_BOOL) { std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
return static_cast<T>(x);
});
} else {
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
}
} }
return true; return true;
@@ -470,29 +462,6 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required); return get_key_or_arr(llm_kv(kid), result, n, required);
} }
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
// throw and error if type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}
return get_key(key, result, required);
}
// TODO: this is not very clever - figure out something better // TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -503,7 +472,6 @@ llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
std::vector<std::string> & splits, std::vector<std::string> & splits,
bool use_mmap, bool use_mmap,
bool use_direct_io,
bool check_tensors, bool check_tensors,
bool no_alloc, bool no_alloc,
const llama_model_kv_override * param_overrides_p, const llama_model_kv_override * param_overrides_p,
@@ -536,23 +504,9 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name)); llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx); contexts.emplace_back(ctx);
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
// Disable mmap, as DirectIO is available
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
} else {
// Disable DirectIO and reopen file using std::fopen for mmap
use_direct_io = false;
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
}
}
// Save tensors data offset of the main file. // Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used, // For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights. // so we build a unified tensors index for weights.
@@ -618,7 +572,7 @@ llama_model_loader::llama_model_loader(
} }
} }
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); files.emplace_back(new llama_file(fname_split, "rb"));
contexts.emplace_back(ctx); contexts.emplace_back(ctx);
// Save tensors data offset info of the shard. // Save tensors data offset info of the shard.
@@ -762,7 +716,6 @@ llama_model_loader::llama_model_loader(
} }
this->use_mmap = use_mmap; this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors; this->check_tensors = check_tensors;
this->no_alloc = no_alloc; this->no_alloc = no_alloc;
} }
@@ -982,15 +935,7 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers. // NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4; constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}
// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
std::vector<ggml_backend_buffer_t> host_buffers; std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events; std::vector<ggml_backend_event_t> events;
@@ -1040,7 +985,6 @@ bool llama_model_loader::load_all_data(
// If the backend is supported, create pinned memory buffers and events for synchronisation. // If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) { for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
if (!buf) { if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev)); ggml_backend_dev_name(dev));
@@ -1122,7 +1066,6 @@ bool llama_model_loader::load_all_data(
} }
} else { } else {
const auto & file = files.at(weight->idx); const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) { if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET); file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size); file->read_raw(cur->data, n_size);
@@ -1134,54 +1077,19 @@ bool llama_model_loader::load_all_data(
} else { } else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) { if (upload_backend) {
size_t offset = weight->offs; file->seek(weight->offs, SEEK_SET);
alignment = file->read_alignment();
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);
// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
size_t bytes_read = 0; size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)
while (bytes_read < read_end - read_start) { while (bytes_read < n_size) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]); ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
// Read aligned chunk from file ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;
// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}
// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}
// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend); ggml_backend_event_record(events[buffer_idx], upload_backend);
data_read += data_to_copy; bytes_read += read_iteration;
bytes_read += read_size;
++buffer_idx; ++buffer_idx;
buffer_idx %= n_buffers; buffer_idx %= n_buffers;
} }


@@ -70,7 +70,6 @@ struct llama_model_loader {
size_t n_bytes = 0; size_t n_bytes = 0;
bool use_mmap = false; bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors; bool check_tensors;
bool no_alloc; bool no_alloc;
@@ -98,7 +97,6 @@ struct llama_model_loader {
const std::string & fname, const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap, bool use_mmap,
bool use_direct_io,
bool check_tensors, bool check_tensors,
bool no_alloc, bool no_alloc,
const llama_model_kv_override * param_overrides_p, const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
template<typename T> template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
std::string get_arch_name() const; std::string get_arch_name() const;
enum llm_arch get_arch() const; enum llm_arch get_arch() const;


@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens()); add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
if (hparams.n_embd_out_impl > 0) {
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);

File diff suppressed because it is too large


@@ -11,7 +11,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
struct llama_cparams; struct llama_cparams;
@@ -25,14 +24,12 @@ enum llm_type {
LLM_TYPE_17M, LLM_TYPE_17M,
LLM_TYPE_22M, LLM_TYPE_22M,
LLM_TYPE_33M, LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M, LLM_TYPE_60M,
LLM_TYPE_70M, LLM_TYPE_70M,
LLM_TYPE_80M, LLM_TYPE_80M,
LLM_TYPE_109M, LLM_TYPE_109M,
LLM_TYPE_137M, LLM_TYPE_137M,
LLM_TYPE_140M, LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M, LLM_TYPE_160M,
LLM_TYPE_190M, LLM_TYPE_190M,
LLM_TYPE_220M, LLM_TYPE_220M,
@@ -42,7 +39,6 @@ enum llm_type {
LLM_TYPE_335M, LLM_TYPE_335M,
LLM_TYPE_350M, LLM_TYPE_350M,
LLM_TYPE_360M, LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M, LLM_TYPE_410M,
LLM_TYPE_450M, LLM_TYPE_450M,
LLM_TYPE_475M, LLM_TYPE_475M,
@@ -121,12 +117,10 @@ enum llm_type {
LLM_TYPE_31B_A3_5B, LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B, LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B, LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B, LLM_TYPE_E2B,
LLM_TYPE_E4B, LLM_TYPE_E4B,
@@ -471,6 +465,8 @@ struct llama_model {
struct ggml_tensor * dense_2_out_layers = nullptr; struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr; struct ggml_tensor * dense_3_out_layers = nullptr;
llama_model_params params;
// gguf metadata // gguf metadata
std::unordered_map<std::string, std::string> gguf_kv; std::unordered_map<std::string, std::string> gguf_kv;
@@ -480,9 +476,6 @@ struct llama_model {
// for quantize-stats only // for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name; std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
// for keeping track of associated LoRA adapters
std::unordered_set<llama_adapter_lora *> loras;
int64_t t_load_us = 0; int64_t t_load_us = 0;
int64_t t_start_us = 0; int64_t t_start_us = 0;
@@ -504,9 +497,6 @@ struct llama_model {
size_t n_tensors() const; size_t n_tensors() const;
size_t n_devices() const; size_t n_devices() const;
uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const; std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model // total number of parameters in the model
@@ -535,8 +525,6 @@ struct llama_model {
ggml_cgraph * build_graph(const llm_graph_params & params) const; ggml_cgraph * build_graph(const llm_graph_params & params) const;
private: private:
llama_model_params params;
struct impl; struct impl;
std::unique_ptr<impl> pimpl; std::unique_ptr<impl> pimpl;
}; };


@@ -422,6 +422,57 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
++qs.i_ffn_up; ++qs.i_ffn_up;
} }
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
{
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
return new_type; return new_type;
} }
@@ -545,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} }
std::vector<std::string> splits = {}; std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params()); llama_model model(llama_model_default_params());
@@ -824,69 +875,21 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc. // get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) { if (!params->pure && ggml_is_quantized(default_type)) {
// if the user provided tensor types - use those int fallback = qs.n_fallback;
bool manual = false; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
if (params->tensor_types) { // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
if (params->tensor_types && qs.n_fallback - fallback == 0) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name); const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) { for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) { if (qtype != new_type) {
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
manual = true;
break;
} }
} }
} }
} }
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
{
bool convert_incompatible_tensor = false;
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
}
} }
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type; new_type = params->token_embedding_type;

File diff suppressed because it is too large


@@ -14,19 +14,7 @@ struct llama_grammar;
struct llama_sampler_chain { struct llama_sampler_chain {
llama_sampler_chain_params params; llama_sampler_chain_params params;
// has .backend_init() been called? std::vector<struct llama_sampler *> samplers;
bool is_init = false;
struct info {
bool is_backend;
llama_sampler * ptr;
};
std::vector<info> samplers;
// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
std::vector<llama_token_data> cur;
// timing // timing
@@ -36,9 +24,9 @@ struct llama_sampler_chain {
}; };
struct llama_sampler * llama_sampler_init_dry_testing( struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size, int32_t context_size,
float dry_multiplier, float dry_multiplier,
float dry_base, float dry_base,
int32_t dry_allowed_length, int32_t dry_allowed_length,
int32_t dry_penalty_last_n, int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>> & seq_breakers); const std::vector<std::vector<llama_token>>& seq_breakers);


@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2: case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = { regex_exprs = {
// original regex from tokenizer.json // original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -461,13 +454,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
default: default:
// default regex for BPE tokenization pre-processing // default regex for BPE tokenization pre-processing
regex_exprs = { regex_exprs = {
@@ -1863,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") { tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if ( } else if (
tokenizer_pre == "falcon") { tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1886,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" || tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum" || tokenizer_pre == "mellum") {
tokenizer_pre == "modern-bert" ) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if ( } else if (
tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v1-en" ||
@@ -1961,9 +1941,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if ( } else if (
tokenizer_pre == "exaone4") { tokenizer_pre == "exaone4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "exaone-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
} else if ( } else if (
tokenizer_pre == "chameleon") { tokenizer_pre == "chameleon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -2026,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") { tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2076,7 +2049,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx); scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
} }
const uint32_t n_scores = score_idx != -1 ? gguf_get_arr_n(ctx, score_idx) : 0;
const int * toktypes = nullptr; const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) { if (toktype_idx != -1) {
@@ -2098,7 +2070,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
auto & token_data = id_to_token[i]; auto & token_data = id_to_token[i];
token_data.text = std::move(word); token_data.text = std::move(word);
token_data.score = (scores && i < n_scores) ? scores[i] : 0.0f; token_data.score = scores ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
@@ -2204,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, we apply this workaround to find the tokens based on their text // for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc. // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) { if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false if (false
@@ -2221,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling || t.first == "<end_of_utterance>" // smoldocling
) { ) {
special_eot_id = t.second; special_eot_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2235,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
) { ) {
special_eom_id = t.second; special_eom_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2255,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_prefix|>" // GLM-4.5 || t.first == "<|code_prefix|>" // GLM-4.5
) { ) {
special_fim_pre_id = t.second; special_fim_pre_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2275,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_suffix|>" // GLM-4.5 || t.first == "<|code_suffix|>" // GLM-4.5
) { ) {
special_fim_suf_id = t.second; special_fim_suf_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2295,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_middle|>" // GLM-4.5 || t.first == "<|code_middle|>" // GLM-4.5
) { ) {
special_fim_mid_id = t.second; special_fim_mid_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2312,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<PAD>" || t.first == "<PAD>"
) { ) {
special_fim_pad_id = t.second; special_fim_pad_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2330,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<reponame>" // Granite || t.first == "<reponame>" // Granite
) { ) {
special_fim_rep_id = t.second; special_fim_rep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
@@ -2344,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|file_sep|>" // Qwen || t.first == "<|file_sep|>" // Qwen
) { ) {
special_fim_sep_id = t.second; special_fim_sep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
} }
} }
// auto-detect unused tokens: e.g. control tokens with the word "unused"
// ideally, these tokens should be marked as unused during conversion
{
uint32_t n_unused = 0;
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
continue;
}
if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
if (strstr(t.first.c_str(), "unused") != NULL) {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
}
}
if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
n_unused++;
}
}
LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
}
// maintain a list of tokens that cause end-of-generation // maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal // this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2397,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} }
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (false if (false
|| t.first == "<|eot_id|>" || t.first == "<|eot_id|>"
|| t.first == "<|im_end|>" || t.first == "<|im_end|>"
|| t.first == "<|end|>" || t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony || t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony || t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>" || t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
@@ -2416,31 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling || t.first == "<end_of_utterance>" // smoldocling
) { ) {
special_eog_ids.insert(t.second); special_eog_ids.insert(t.second);
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str()); __func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} else { } else {
if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) { // token is control, but not marked as EOG -> print a debug log
// token is control, but not marked as EOG -> print a debug log if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
if (special_eog_ids.count(t.second) == 0) { LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", __func__, t.second, t.first.c_str());
__func__, t.second, t.first.c_str());
}
} }
} }
} }
// @ngxson : quick hack for gpt-oss, always render these tokens // @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") { if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n", id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
__func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
} }
} }
@@ -2460,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open), // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
// we remove the "<|end|>" token from the EOG list // we remove the "<|end|>" token from the EOG list
{ {
bool has_return = false; bool has_return = false;
bool has_call = false; bool has_call = false;
bool has_end = false; bool has_end = false;
bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL; llama_token end_id = LLAMA_TOKEN_NULL;
LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__); LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) { for (auto tid : special_eog_ids) {
auto & text = id_to_token[tid].text; LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str()); if (id_to_token[tid].text == "<|return|>") {
if (text == "<|return|>") {
has_return = true; has_return = true;
} else if (text == "<|call|>" || text == "<|calls|>") { } else if (id_to_token[tid].text == "<|call|>") {
has_call = true; has_call = true;
} else if (text == "<|flush|>") { } else if (id_to_token[tid].text == "<|end|>") {
has_flush = true;
} else if (text == "<|end|>") {
has_end = true; has_end = true;
end_id = tid; end_id = tid;
} }
} }
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) { if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id); special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
auto & attr = id_to_token[end_id].attr; LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
} }
} }
} }
@@ -2593,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) { for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
} }
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0 ) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
} }
} }
} }
@@ -3293,34 +3211,34 @@ int32_t llama_vocab::impl::detokenize(
} }
void llama_vocab::impl::print_info() const { void llama_vocab::impl::print_info() const {
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
// special tokens // special tokens
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); } if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); } if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
for (const auto & id : special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
}
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {

View File

@@ -51,9 +51,6 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
}; };
struct LLM_KV; struct LLM_KV;

View File

@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}, &ud); }, &ud);
llama_model_params mparams_copy = *mparams; llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true; mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false; mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy); llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) { if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
} }
} }
for (size_t i = 0; i < ret.size(); i++) { for (size_t i = 0; i < ret.size(); i++) {
size_t free; size_t free, total;
size_t total;
ggml_backend_dev_memory(model->devices[i], &free, &total); ggml_backend_dev_memory(model->devices[i], &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
ret[i].free = free; ret[i].free = free;
ret[i].total = total; ret[i].total = total;
} }
@@ -152,15 +139,12 @@ enum layer_fraction_t {
}; };
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
class llama_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static void llama_params_fit_impl( static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024; constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t; typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params(); const llama_model_params default_mparams = llama_model_default_params();
@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
return; return;
} }
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
std::vector<std::string> dev_names; std::vector<std::string> dev_names;
{ {
dev_names.reserve(nd); dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
} }
} }
int64_t sum_free = 0; int64_t sum_total = 0;
int64_t sum_projected_free = 0; int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0; int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_model = 0; int64_t sum_projected_used = 0;
std::vector<int64_t> projected_free_per_device; int64_t sum_projected_ctx = 0;
projected_free_per_device.reserve(nd);
if (nd > 1) { if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,106 +194,63 @@ static void llama_params_fit_impl(
const int64_t projected_used = dmd.mb.total(); const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used; const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free; sum_total += dmd.total;
sum_projected_used += projected_used; sum_projected_used += projected_used;
sum_projected_free += projected_free; sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model; min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
if (nd > 1) { if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
} }
} }
assert(sum_free >= 0 && sum_projected_used >= 0); assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB); __func__, sum_projected_used/MiB, sum_total/MiB);
if (nd == 1) { if (min_projected_free >= margin) {
if (projected_free_per_device[0] >= margins[0]) { if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB); __func__, min_projected_free/MiB, margin/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return; return;
} }
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
} }
// step 2: try reducing memory use by reducing the context size // step 2: try reducing memory use by reducing the context size
{ {
int64_t global_surplus = sum_projected_free; int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
if (global_surplus < 0) { if (global_surplus < 0) {
if (nd == 1) { LLAMA_LOG_INFO(nd == 1 ?
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
__func__, margins[0]/MiB, -global_surplus/MiB); "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
} else { __func__, margin/MiB, -global_surplus/MiB);
LLAMA_LOG_INFO(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) { if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) { if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free; const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
for (size_t id = 0; id < nd; id++) { const uint32_t ctx_reduction = std::min(
sum_used_target -= margins[id]; uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
} cparams->n_ctx = hp_nct - ctx_reduction;
if (nd > 1) { const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
// for multiple devices we need to be more conservative in terms of how much context we think can fit: global_surplus += memory_reduction;
// - for dense models only whole layers can be assigned to devices LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
// - on average we expect a waste of 0.5 layers/tensors per device if (global_surplus >= 0) {
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd == 1) { if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return; return;
} }
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__); LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
} }
} else { } else {
if (n_ctx_min == UINT32_MAX) { LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); __func__, hp_nct, n_ctx_min);
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
} }
} else { } else {
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
@@ -325,28 +259,32 @@ static void llama_params_fit_impl(
} }
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
} }
if (nd > 1) { if (nd > 1) {
if (!tensor_split) { if (!tensor_split) {
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort"); throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
} }
if (mparams->tensor_split) { if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) { if (mparams->tensor_split[id] != 0.0f) {
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort"); throw std::runtime_error("model_params::tensor_split already set by user, abort");
} }
} }
} }
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
} }
} }
if (!tensor_buft_overrides) { if (!tensor_buft_overrides) {
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort"); throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
} }
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort"); throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
} }
// step 3: iteratively fill the back to front with "dense" layers // step 3: iteratively fill the back to front with "dense" layers
@@ -399,11 +337,6 @@ static void llama_params_fit_impl(
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE: // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE; layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
}; };
const size_t ntbo = llama_max_tensor_buft_overrides(); const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -412,7 +345,8 @@ static void llama_params_fit_impl(
auto set_ngl_tensor_split_tbo = [&]( auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device, const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts, const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) { llama_model_params & mparams,
const bool add_nonrepeating) {
mparams.n_gpu_layers = 0; mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer; mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -420,25 +354,29 @@ static void llama_params_fit_impl(
tensor_split[id] = ngl_per_device[id].n_layer; tensor_split[id] = ngl_per_device[id].n_layer;
} }
} }
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split; mparams.tensor_split = tensor_split;
size_t itbo = 0; size_t itbo = 0;
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full(); il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) { for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) { if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr; tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr; tensor_buft_overrides[itbo].buft = nullptr;
itbo++; itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides; mparams.tensor_buft_overrides = tensor_buft_overrides;
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == " throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model"); + std::to_string(ntbo) + " is insufficient for model\n");
} }
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type(); tensor_buft_overrides[itbo].buft = overflow_bufts[id];
itbo++; itbo++;
} }
il0 += ngl_per_device[id].n_part; il0 += ngl_per_device[id].n_part;
@@ -453,9 +391,10 @@ static void llama_params_fit_impl(
auto get_memory_for_layers = [&]( auto get_memory_for_layers = [&](
const char * func_name, const char * func_name,
const std::vector<ngl_t> & ngl_per_device, const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> { const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams; llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
const dmds_t dmd_nl = llama_get_device_memory_data( const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -488,9 +427,9 @@ static void llama_params_fit_impl(
const dmds_t dmds_cpu_moe = llama_get_device_memory_data( const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) { for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free; global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id]; global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
} }
if (global_surplus_cpu_moe > 0) { if (global_surplus_cpu_moe > 0) {
@@ -509,18 +448,27 @@ static void llama_params_fit_impl(
std::vector<int64_t> targets; // maximum acceptable memory use per device std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd); targets.reserve(nd);
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]); targets.push_back(dmds_full[id].free - margin);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
} }
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to: // whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd); overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) { for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
} }
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd); std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
// optimize the number of layers per device using the method of false position: // optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound // - ngl_per_device has 0 layers for each device, lower bound
@@ -528,30 +476,22 @@ static void llama_params_fit_impl(
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound // - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) { if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else { } else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
} }
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) { for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device; std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned; ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) { if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
} }
if (ngl_per_device_high[id].n_layer > 0) { if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) { if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) { while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1)); step_size = std::max(step_size, uint32_t(1));
@@ -560,26 +500,25 @@ static void llama_params_fit_impl(
std::vector<ngl_t> ngl_per_device_test = ngl_per_device; std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size; ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) { if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ? ngl_per_device_test[id].n_part += step_size;
step_size - 1 : step_size; // the first layer is the output layer which must always be full
} }
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) { if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
mem = mem_test; mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else { } else {
ngl_per_device_high = ngl_per_device_test; ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test; mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} }
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
} }
} else { } else {
assert(ngl_per_device_high[id].n_layer == n_unassigned); ngl_per_device = ngl_per_device_high;
ngl_per_device = ngl_per_device_high; n_unassigned -= ngl_per_device[id].n_layer;
mem = mem_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} }
} }
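Note: the loop above searches for the largest per-device layer count that still fits under the memory target using the method of false position: keep a low bound that fits and a high bound that does not, and pick the next guess by linearly interpolating the measured memory use. A toy version of that search against a synthetic cost function (the real code re-measures memory through a dry-run model load for each candidate):

// Toy regula-falsi search: largest n_layers whose cost stays under a target.
#include <cstdint>
#include <cstdio>

// synthetic, monotonically increasing memory model (bytes), standing in for
// the dry-run measurement the real code performs per candidate layer count
static int64_t cost(uint32_t n_layers) {
    return int64_t(n_layers) * 350 + 2000;   // fixed overhead + per-layer cost
}

int main() {
    const int64_t target = 12000;

    uint32_t lo = 0;                 // known to fit
    uint32_t hi = 48;                // assumed not to fit
    int64_t  mem_lo = cost(lo);
    int64_t  mem_hi = cost(hi);

    uint32_t delta = hi - lo;
    while (delta > 1) {
        // interpolate where the cost line crosses the target, step at least 1 layer
        uint32_t step = uint32_t(int64_t(delta) * (target - mem_lo) / (mem_hi - mem_lo));
        if (step < 1) step = 1;
        if (step >= delta) step = delta - 1;

        const uint32_t guess = lo + step;
        const int64_t  mem_guess = cost(guess);
        if (mem_guess <= target) { lo = guess; mem_lo = mem_guess; }   // new lower bound, still fits
        else                     { hi = guess; mem_hi = mem_guess; }   // new upper bound, too big
        delta = hi - lo;
    }
    std::printf("best fit: %u layers, %lld bytes <= %lld\n", lo, (long long) cost(lo), (long long) target);
}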
@@ -590,7 +529,7 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
} }
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
return; return;
} }
@@ -610,20 +549,24 @@ static void llama_params_fit_impl(
assert(id_dense_start < nd); assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) { for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device; std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) { for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
ngl_per_device_high[id].n_layer += n_layer_move; ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move; ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0; ngl_per_device_high[jd].n_part = 0;
} }
size_t id_dense_start_high = nd - 1; size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) { if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
while (delta > 1) { while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1)); step_size = std::max(step_size, uint32_t(1));
@@ -639,11 +582,11 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].n_layer += n_convert_jd; ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd; n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) { if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
break; break;
} }
} }
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) { if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
@@ -658,38 +601,32 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high); __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
} }
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
} }
} else { } else {
ngl_per_device = ngl_per_device_high; ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high; id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} }
// try to fit at least part of one more layer // try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) { if (ngl_per_device[id_dense_start].n_layer > 0) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device; std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start; size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--; ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--; ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++; ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++; ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) { if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
id_dense_start_test++; id_dense_start_test++;
} }
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -697,10 +634,9 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -709,10 +645,9 @@ static void llama_params_fit_impl(
} else { } else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test; ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test; mem = mem_test;
id_dense_start = id_dense_start_test; id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -727,41 +662,30 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
} }
// print info for devices that were not changed during the conversion from dense only to full layers: set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
} }
enum llama_params_fit_status llama_params_fit( bool llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) { size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us(); const int64_t t0_us = llama_time_us();
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS; bool ok = true;
try { try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level); llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) { } catch (const std::runtime_error & e) {
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what()); LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_ERROR; ok = false;
} }
const int64_t t1_us = llama_time_us(); const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status; return ok;
} }
struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = { struct llama_sampler_chain_params result = {
/*.no_perf =*/ true, /*.no_perf =*/ true,
}; };
return result; return result;
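Note: the llama_params_fit change earlier in this hunk swaps between two error-reporting styles. The removed side throws a dedicated exception type for expected "could not fit" outcomes and catches it before the generic std::runtime_error so it can return a distinct status; the restored side returns a plain bool. A small standalone illustration of the dedicated-exception pattern is below; the enum and function names are invented for the example.

// Illustrative only: a dedicated exception type lets the caller separate
// "expected failure" from "unexpected error" while using exceptions internally.
#include <cstdio>
#include <stdexcept>

enum fit_status { FIT_SUCCESS, FIT_FAILURE, FIT_ERROR };   // invented names

class fit_exception : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;               // expected, recoverable failures
};

static void fit_impl(bool user_already_set_layers) {
    if (user_already_set_layers) {
        throw fit_exception("n_gpu_layers already set by user, abort");          // expected path
    }
    throw std::runtime_error("backend returned an inconsistent memory report");  // unexpected path
}

static fit_status fit(bool user_already_set_layers) {
    try {
        fit_impl(user_already_set_layers);
        return FIT_SUCCESS;
    } catch (const fit_exception & e) {        // must come before the base class
        std::fprintf(stderr, "fit failed: %s\n", e.what());
        return FIT_FAILURE;
    } catch (const std::runtime_error & e) {
        std::fprintf(stderr, "fit error: %s\n", e.what());
        return FIT_ERROR;
    }
}

int main() {
    std::printf("status A = %d\n", fit(true));    // FIT_FAILURE
    std::printf("status B = %d\n", fit(false));   // FIT_ERROR
}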
@@ -834,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us; model.t_start_us = tm.t_start_us;
try { try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info(); ml.print_info();
@@ -1097,55 +1021,25 @@ int32_t llama_chat_apply_template(
// model split // model split
// //
int32_t llama_split_path( int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
char * split_path,
size_t maxlen,
const char * path_prefix,
int32_t split_no,
int32_t split_count) {
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
const int written = snprintf( return strlen(split_path);
split_path,
maxlen,
SPLIT_PATH_FORMAT,
path_prefix,
split_no + 1,
split_count
);
if (written < 0 || (size_t) written >= maxlen) {
return 0;
} }
return 0;
return (int32_t) written;
} }
int32_t llama_split_prefix( int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
char * split_prefix, std::string str_split_path(split_path);
size_t maxlen,
const char * split_path,
int32_t split_no,
int32_t split_count) {
const std::string str_split_path(split_path);
char postfix[32]; char postfix[32];
snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count); snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);
const std::string str_postfix(postfix); // check if split_prefix ends with postfix
if (str_split_path.size() <= str_postfix.size()) { int size_prefix = str_split_path.size() - str_postfix.size();
return 0; if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
} snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
return size_prefix;
const size_t size_prefix = str_split_path.size() - str_postfix.size();
if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
const size_t copy_len = std::min(size_prefix + 1, maxlen);
snprintf(split_prefix, copy_len, "%s", split_path);
return (int32_t) size_prefix;
} }
return 0; return 0;
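Note: both versions of llama_split_path and llama_split_prefix above revolve around the same naming scheme, "%s-%05d-of-%05d.gguf". The following self-contained demonstration builds such a shard path and recovers the prefix again with plain C/C++ string handling, independent of llama.cpp.

// Round-trip of the "<prefix>-%05d-of-%05d.gguf" split naming scheme.
#include <cstdio>
#include <string>

int main() {
    const char * prefix      = "models/llama-70b-q4";   // example prefix, not a real file
    const int    split_no    = 2;                        // zero-based index of this shard
    const int    split_count = 5;                        // total number of shards

    // build the path of one shard (shard numbers are printed one-based)
    char path[512];
    std::snprintf(path, sizeof(path), "%s-%05d-of-%05d.gguf", prefix, split_no + 1, split_count);
    std::printf("path   = %s\n", path);   // models/llama-70b-q4-00003-of-00005.gguf

    // recover the prefix: strip the "-00003-of-00005.gguf" postfix if it matches
    char postfix[32];
    std::snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);

    const std::string p(path);
    const std::string suffix(postfix);
    if (p.size() > suffix.size() && p.compare(p.size() - suffix.size(), suffix.size(), suffix) == 0) {
        std::printf("prefix = %s\n", p.substr(0, p.size() - suffix.size()).c_str());
    }
}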

View File

@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
// dual attention normalization (pre) // dual attention normalization (pre)
cur = build_norm(inpL, cur = build_norm(inpL,
model.layers[il].attn_norm, NULL, model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_normed", il); cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il); cb(Kcur, "Kcur_normed", il);
// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) { if (use_rope) {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il); cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il); cb(Kcur, "Kcur_rope", il);
} }
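Note: the use_rope condition above applies RoPE to a layer only when n_no_rope_layer_step is positive and (il + 1) is not a multiple of it, so every n-th layer skips RoPE. A quick sketch of which layers that selects; the step value is just an example, not taken from a real model.

// Which layers get RoPE under the "(il + 1) % n_no_rope_layer_step != 0" rule.
#include <cstdio>

int main() {
    const int n_layer = 12;
    const int n_no_rope_layer_step = 4;   // example value only

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = n_no_rope_layer_step > 0 &&
                              (il + 1) % n_no_rope_layer_step != 0;
        std::printf("layer %2d: %s\n", il, use_rope ? "rope" : "no rope");
    }
    // with step 4, layers 3, 7, 11 (zero-based) skip RoPE
}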

View File

@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il); LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) { } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur, cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
type_op, LLM_FFN_PAR, il); model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else { } else {
cur = build_ffn(cur, cur = build_ffn(cur,
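Note: the removed lines above infer a fused up+gate projection when there is no separate ffn_gate tensor and the up projection's output width differs from n_ff, and then pick GEGLU instead of GELU. Below is a minimal sketch of that shape check using a hypothetical tensor struct, not the ggml type.

// Shape-based detection of a fused up+gate FFN projection (illustrative types only).
#include <cstdint>
#include <cstdio>

struct fake_tensor { int64_t ne[2]; };   // ne[1] = output width, mirroring ggml's convention

enum ffn_act { FFN_GELU, FFN_GEGLU };

static ffn_act pick_activation(const fake_tensor * ffn_up, const fake_tensor * ffn_gate, int64_t n_ff) {
    // no dedicated gate tensor plus a wider-than-n_ff up projection means the
    // gate is packed into ffn_up, so the split-and-gate (GEGLU) path is needed
    const bool up_contains_gate = ffn_gate == nullptr && ffn_up->ne[1] != n_ff;
    return up_contains_gate ? FFN_GEGLU : FFN_GELU;
}

int main() {
    const int64_t n_ff = 4096;
    fake_tensor fused = { { 1024, 2 * n_ff } };   // up and gate packed together
    fake_tensor plain = { { 1024, n_ff } };       // plain up projection

    std::printf("fused : %s\n", pick_activation(&fused, nullptr, n_ff) == FFN_GEGLU ? "GEGLU" : "GELU");
    std::printf("plain : %s\n", pick_activation(&plain, nullptr, n_ff) == FFN_GEGLU ? "GEGLU" : "GELU");
}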

View File

@@ -3,14 +3,12 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot); GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * inpL; ggml_tensor *inpL, *cur;
ggml_tensor * cur;
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
} }
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention // build self attention
{ {

View File

@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il); const bool is_swa = hparams.is_swa(il);
// UNUSED:
// const float freq_base_l = model.get_rope_freq_base (cparams, il);
// const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm // norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

View File

@@ -2,11 +2,14 @@
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const bool is_mla = hparams.is_mla(); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla(); const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla(); const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
@@ -40,8 +43,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr; auto * inp_attn = build_attn_inp_kv();
auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -55,9 +57,6 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// self_attention // self_attention
{ {
ggml_tensor * q = NULL; ggml_tensor * q = NULL;
const bool is_lite = model.layers[il].wq;
if (!is_lite) { if (!is_lite) {
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il); cb(q, "q", il);
@@ -125,14 +124,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
// note: rope must go first for in-place context shifting in build_rope_shift() // note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
cb(kv_cmpr, "kv_cmpr_reshape", il); cb(kv_cmpr, "kv_cmpr_reshape", il);
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens} // {kv_lora_rank, 1, n_tokens}
@@ -146,7 +145,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
} }
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k, cur = build_attn(inp_attn,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else { } else {
@@ -170,10 +169,11 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
Vcur = ggml_cont(ctx0, Vcur); Vcur = ggml_cont(ctx0, Vcur);
cb(Vcur, "Vcur_cont", il); cb(Vcur, "Vcur_cont", il);
ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0); // note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
if (inp_attn_scale) { if (inp_attn_scale) {
@@ -183,7 +183,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
} }
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn_kv, cur = build_attn(inp_attn,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
} }
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b, model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used, n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm, LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale, true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func, (llama_expert_gating_func_type) hparams.expert_gating_func,
il); il);
cb(moe_out, "ffn_moe_out", il); cb(moe_out, "ffn_moe_out", il);

View File

@@ -1,146 +0,0 @@
#include "models.h"
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn_iswa = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// use RoPE for SWA layers
const bool is_local_layer = hparams.is_swa(il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
if (is_local_layer) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn_iswa,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// norm
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
// final norm
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
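A minimal sketch of the feed-forward combination built in the MoE branch above: the routed expert output and the always-active shared expert are summed before the residual add back onto ffn_inp. The helper below is illustrative only and is not part of the vendored sources.

static ggml_tensor * combine_moe_and_shared_expert(
        ggml_context * ctx0,
        ggml_tensor  * moe_out,    // output of build_moe_ffn (routed experts)
        ggml_tensor  * ffn_shexp,  // output of the shared-expert build_ffn
        ggml_tensor  * ffn_inp) {  // attention output + layer input (residual stream)
    ggml_tensor * cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed experts + shared expert
    return ggml_add(ctx0, cur, ffn_inp);                    // residual connection
}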

View File

@@ -1,5 +1,7 @@
#include "models.h" #include "models.h"
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k; const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
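A minimal sketch (assumed semantics) of the conditional scaling introduced above: token-id embeddings are scaled by sqrt(n_embd), as in the original Transformer, while raw pre-encoded inputs such as image embeddings pass through unchanged. The helper name is hypothetical.

static ggml_tensor * maybe_scale_token_embd(
        ggml_context * ctx0,
        ggml_tensor  * inpL,            // output of build_inp_embd
        bool           is_token_input,  // ubatch.token in the builders above
        int64_t        n_embd) {
    // only token lookups are scaled; already-encoded embeddings keep their magnitude
    return is_token_input ? ggml_scale(ctx0, inpL, sqrtf((float) n_embd)) : inpL;
}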

View File

@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm // norm
cur = build_norm(inpL, cur = build_norm(inpL,
model.layers[il].attn_norm, NULL, model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);

View File

@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -1,5 +1,7 @@
#include "models.h" #include "models.h"
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params), llm_graph_context(params),
model(model), model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); if (ubatch.token) {
cb(inpL, "inp_scaled", -1); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -245,30 +248,20 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code // equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens] // output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
auto inp = std::make_unique<llm_graph_input_embd>(n_embd); auto inp = std::make_unique<llm_graph_input_embd>();
ggml_tensor * inp_per_layer; ggml_tensor * inp_per_layer;
if (ubatch.token) { if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens); ggml_set_input(inp->tokens);
res->t_inp_tokens = inp->tokens; res->t_tokens = inp->tokens;
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens); inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
cb(inp_per_layer, "inp_per_layer_selected", -1); cb(inp_per_layer, "inp_per_layer_selected", -1);
res->add_input(std::move(inp));
} else { } else {
// Vision embedding path: use padding token (ID=0) embedding GGML_ABORT("TODO: support embd input");
// TODO: verify if this is the correct behavior in transformers implementation
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
// Extract and dequantize padding token embedding (row 0)
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
cb(inp_per_layer, "inp_per_layer_vision", -1);
} }
res->add_input(std::move(inp));
return inp_per_layer; return inp_per_layer;
} }
@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
-1); // [n_embd_altup, n_layer, n_tokens] -1); // [n_embd_altup, n_layer, n_tokens]
cb(per_layer_proj, "per_layer_proj", -1); cb(per_layer_proj, "per_layer_proj", -1);
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer); inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
cb(inp_per_layer, "inp_per_layer", -1); cb(inp_per_layer, "inp_per_layer", -1);

View File

@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 && const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0; (il + 1) % hparams.n_no_rope_layer_step != 0;
@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
if (use_rope) { if (use_rope) {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors, ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors, ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
} else if (inp_attn_scale) { } else if (inp_attn_scale) {
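A hedged sketch of the per-layer RoPE gating used above: with n_no_rope_layer_step set to 4, for example, layers 3, 7, 11, ... (0-based) skip rotary embedding, and all other layers apply it with the global freq_base/freq_scale. The helper is illustrative only.

static bool layer_uses_rope(int il, uint32_t n_no_rope_layer_step) {
    // same condition as in the builder above
    return n_no_rope_layer_step > 0 && (il + 1) % n_no_rope_layer_step != 0;
}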

View File

@@ -1,7 +1,6 @@
#include "models.h" #include "models.h"
template <bool embed> llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>; auto * inp_attn = build_attn_inp_kv();
inp_attn_type * inp_attn = nullptr;
if constexpr (embed) {
inp_attn = build_attn_inp_no_cache();
} else {
inp_attn = build_attn_inp_kv();
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(cur, "result_norm", -1); cb(cur, "result_norm", -1);
res->t_embd = cur; res->t_embd = cur;
if constexpr (!embed) { // lm_head
// lm_head cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
res->t_logits = cur; res->t_logits = cur;
}
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
template struct llm_build_llama<false>;
template struct llm_build_llama<true>;
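A hedged sketch of the compile-time selection being removed above: the templated builder picked the cache-free attention input type for the embedding variant and the KV-cache type for the decoder variant (type names as used elsewhere in this file; the alias itself is illustrative).

template <bool embed>
using llama_attn_inp_t = std::conditional_t<embed,
    llm_graph_input_attn_no_cache,   // embedding models: no KV cache
    llm_graph_input_attn_kv>;        // decoder models: KV-cache-backed attention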

View File

@@ -1,117 +0,0 @@
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
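Taken together, each of the n_layer blocks above implements the standard pre-norm decoder recurrence (a hedged summary using the tensor names from the code; note that the per-head Q/K RMSNorm is applied after RoPE in this builder):

$$h_\ell = x_\ell + \mathrm{Attn}(\mathrm{RMSNorm}(x_\ell)), \qquad x_{\ell+1} = h_\ell + \mathrm{FFN}_{\mathrm{SwiGLU}}(\mathrm{RMSNorm}(h_\ell))$$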

View File

@@ -1,123 +0,0 @@
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
uint32_t n_head_l = hparams.n_head(il);
uint32_t n_head_kv_l = hparams.n_head_kv(il);
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// self_attention
{
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
ggml_tensor * sinks = model.layers[il].attn_sinks;
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -9,7 +9,6 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv; const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur; ggml_tensor * cur;

View File

@@ -167,10 +167,6 @@ struct llm_build_exaone : public llm_graph_context {
llm_build_exaone(const llama_model & model, const llm_graph_params & params); llm_build_exaone(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_exaone_moe : public llm_graph_context {
llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_falcon : public llm_graph_context { struct llm_build_falcon : public llm_graph_context {
llm_build_falcon(const llama_model & model, const llm_graph_params & params); llm_build_falcon(const llama_model & model, const llm_graph_params & params);
}; };
@@ -307,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
}; };
template <bool embed>
struct llm_build_llama : public llm_graph_context { struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params); llm_build_llama(const llama_model & model, const llm_graph_params & params);
}; };
@@ -316,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params); llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_maincoder : public llm_graph_context {
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mamba : public llm_graph_context_mamba { struct llm_build_mamba : public llm_graph_context_mamba {
llm_build_mamba(const llama_model & model, const llm_graph_params & params); llm_build_mamba(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_mimo2_iswa : public llm_graph_context {
llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_minicpm3 : public llm_graph_context { struct llm_build_minicpm3 : public llm_graph_context {
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
}; };
@@ -340,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params); llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context { struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params); llm_build_mpt(const llama_model & model, const llm_graph_params & params);
}; };
@@ -413,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
llm_build_plamo(const llama_model & model, const llm_graph_params & params); llm_build_plamo(const llama_model & model, const llm_graph_params & params);
}; };
template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_plm : public llm_graph_context { struct llm_build_plm : public llm_graph_context {
llm_build_plm(const llama_model & model, const llm_graph_params & params); llm_build_plm(const llama_model & model, const llm_graph_params & params);
}; };
@@ -470,8 +448,7 @@ private:
ggml_tensor * cur, ggml_tensor * cur,
int il); int il);
// returns pair of output and new state ggml_tensor * build_delta_net_chunking(
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -483,8 +460,7 @@ private:
ggml_tensor * diag_mask, ggml_tensor * diag_mask,
int il); int il);
// returns pair of output and new state ggml_tensor * build_delta_net_autoregressive(
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -499,11 +475,6 @@ private:
ggml_tensor * gate, ggml_tensor * gate,
int layer); int layer);
// returns pair of qkv, z
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
ggml_tensor * input,
int il);
const llama_model & model; const llama_model & model;
}; };

View File

@@ -1,116 +0,0 @@
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
// embed layer norm
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// attention layer norm
if (model.layers[il].attn_norm) {
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
}
// self attention
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const size_t type_size = ggml_type_size(cur->type);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// re-add the layer input
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// attention layer norm
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
// the attention output bypasses the intermediate (FFN) layer via the residual
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
// extracting cls token
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
cb(cur, "cls_pooled_embd", -1);
}
cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
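A hedged sketch of the CLS pooling step above: the final-norm output has shape [n_embd, n_tokens], so a 1-D view of the first n_embd elements at byte offset 0 keeps exactly the first (CLS) token's embedding. The helper is illustrative only.

static ggml_tensor * pool_cls_token(ggml_context * ctx0, ggml_tensor * embd, int64_t n_embd) {
    // view of column 0, i.e. the embedding of the CLS token
    return ggml_view_1d(ctx0, embd, n_embd, 0);
}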

View File

@@ -67,7 +67,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const llama_model & model, const llama_model & model,
const int64_t n_embd_head, const int64_t n_embd_head,
const int il) { const int il) {
// compute Q and K // compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
if (model.layers[il].bq) { if (model.layers[il].bq) {

View File

@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// norm // norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );

View File

@@ -1,128 +0,0 @@
#include "models.h"
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * residual = inpL;
float freq_base_l = 0.0f;
float freq_scale_l = 0.0f;
if constexpr (iswa) {
freq_base_l = model.get_rope_freq_base (cparams, il);
freq_scale_l = model.get_rope_freq_scale(cparams, il);
} else {
freq_base_l = freq_base;
freq_scale_l = freq_scale;
}
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const int32_t n_head = hparams.n_head(il);
const int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;
const int64_t k_offset = head_dim_q * n_head;
const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "attn_q_norm", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "attn_k_norm", il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
cb(cur, "attn_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "attn_residual", il);
residual = cur;
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_plamo3<false>;
template struct llm_build_plamo3<true>;

View File

@@ -5,7 +5,6 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &
const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv; const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur; ggml_tensor * cur;

View File

@@ -86,15 +86,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
// utility to get one slice from the third dimension ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// input dim: [x, y, c, b]
// output dim: [x, y, 1, b]
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -195,16 +187,18 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast = ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(decay_mask, "decay_mask", il);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask); decay_mask = ggml_exp(ctx0, decay_mask);
@@ -214,7 +208,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(attn, "attn_pre_solve", il);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
@@ -222,7 +217,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask); attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity); attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
cb(attn, "attn_solved", il);
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
@@ -230,126 +226,116 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
cb(kbeta_gexp, "kbeta_gexp", il);
ggml_tensor * k_cumdecay = ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); cb(k_cumdecay, "k_cumdecay", il);
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// vectorized calculation of key_gdiff
// improved from the chunked version:
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
// get last element in g_cumsum along chunk_size dimension (ne0)
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
g_last = ggml_cont(ctx0, g_last);
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * core_attn_out = nullptr; ggml_tensor * core_attn_out = nullptr;
ggml_tensor * new_state = ggml_dup(ctx0, state);
cb(new_state, "new_state", il);
for (int64_t chunk = 0; chunk < n_chunks; chunk++) { for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
// shape: (S_k, chunk_size, 1, H_k * n_seqs) auto chunkify = [=](ggml_tensor * t) {
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (S_v, chunk_size, 1, H_v * n_seqs) auto chunkify_g = [=](ggml_tensor * t) {
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, t->ne[1], 1, t->ne[3],
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
};
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs) ggml_tensor * k_chunk = chunkify(k);
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul ggml_tensor * q_chunk = chunkify(q);
ggml_tensor * v_chunk = chunkify(v);
// shape: (chunk_size, 1, H_v * n_seqs) ggml_tensor * g_cs_chunk = chunkify_g(g_cumsum);
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat ggml_tensor * g_cs_chunk_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cs_chunk));
ggml_tensor * decay_mask_chunk = chunkify(decay_mask);
ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
ggml_tensor * gexp_chunk = ggml_exp(ctx0, g_cs_chunk_t);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
// replaced by precomputed attn_kq attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); attn = ggml_mul(ctx0, attn, decay_mask_chunk);
cb(attn_chunk, "attn_chunk", il); attn = ggml_mul(ctx0, attn, diag_mask);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
// v_new = v_i - v_prime // v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new_chunk", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter_chunk", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new // core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
cb(v_attn, "v_attn_chunk", il);
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
core_attn_out = core_attn_out == nullptr core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
? core_attn_out_chunk
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cs_chunk_t, g_cs_chunk_t->ne[0], 1, g_cs_chunk_t->ne[2], g_cs_chunk_t->ne[3],
g_cs_chunk_t->nb[1], g_cs_chunk_t->nb[2], g_cs_chunk_t->nb[3],
g_cs_chunk_t->nb[0] * (g_cs_chunk_t->ne[1] - 1)));
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cs_chunk, g_cs_chunk->ne[0], g_cs_chunk->ne[2], g_cs_chunk->ne[3]);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k_chunk,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
new_state = ggml_add(ctx0, new_state = ggml_add(ctx0,
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last, gexp_last->ne[0], gexp_last->ne[1], H_v, n_seqs)),
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
} }
// truncate padded tokens core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
S_v, n_tokens, H_v, n_seqs, ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0);
ggml_row_size(core_attn_out->type, S_v),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
output_tokens = ggml_cont(ctx0, output_tokens);
cb(output_tokens, "output_tokens", il); cb(output_tokens, "output_tokens", il);
// permute back to (S_v, H_v, n_tokens, n_seqs) // flatten output
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); ggml_tensor * flat_output =
output_tokens = ggml_cont(ctx0, output_tokens); ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
return {output_tokens, new_state}; ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
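A hedged summary of the chunked gated delta rule assembled above, written with the same symbol names (S is the recurrent state, A the solved intra-chunk attention, g_cum the cumulative gate; ggml's mul_mat transposition conventions are omitted):

$$
\begin{aligned}
v_{\mathrm{new}} &= v - k_{\mathrm{cumdecay}} \cdot S \\
o &= \big(q \odot e^{\,g_{\mathrm{cum}}}\big) \cdot S + A \cdot v_{\mathrm{new}} \\
S &\leftarrow e^{\,g_{\mathrm{last}}} \odot S + \big(k \odot e^{\,g_{\mathrm{last}} - g_{\mathrm{cum}}}\big)^{\top} \cdot v_{\mathrm{new}}
\end{aligned}
$$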
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive( ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
@@ -433,7 +419,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
cb(core_attn_out, "output_tokens", il); cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il); cb(state, "new_state", il);
return {core_attn_out, state}; // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * llm_build_qwen3next::build_norm_gated(
@@ -533,88 +523,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
return cur; return cur;
} }
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
ggml_tensor * input,
int il) {
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t n_seqs = ubatch.n_seqs;
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t num_k_heads = hparams.ssm_n_group;
const int64_t num_v_heads = hparams.ssm_dt_rank;
const int64_t head_v_dim = d_inner / num_v_heads;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
if (model.layers[il].wqkv) {
// optimized path
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
cb(z, "z", il);
return { qkv_mixed, z };
} else {
// legacy (slower) path
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
z = ggml_cont(ctx0, z);
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
return { qkv_mixed, z };
}
}
ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
llm_graph_input_rs * inp, llm_graph_input_rs * inp,
ggml_tensor * cur, ggml_tensor * cur,
@@ -639,13 +547,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
// Input projections // Input projections
auto qkvz = build_qkvz(cur, il); ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
ggml_tensor * qkv_mixed = qkvz.first; cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
ggml_tensor * z = qkvz.second;
ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il); cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads] // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
@@ -665,9 +575,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il); cb(a, "a", il);
ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); // Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
@@ -676,6 +585,48 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il); cb(gate, "gate", il);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
cb(query, "q", il);
ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
split_sizes_qkvz[0] * sizeof(float));
cb(key, "k", il);
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
cb(value, "v", il);
ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il);
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(query_flat, "query_flat", il);
// key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
cb(key_flat, "key_flat", il);
// value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(value_flat, "value_flat", il);
// Get convolution states from cache // Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
@@ -686,6 +637,17 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il); cb(conv_states, "conv_states", il);
// Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
cb(qkv_mixed, "qkv_mixed", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
// Calculate the total conv dimension
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
// Calculate convolution kernel size // Calculate convolution kernel size
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0]; const int64_t conv_kernel_size = conv_kernel->ne[0];
@@ -693,9 +655,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il); cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il); cb(conv_input, "conv_input", il);
@@ -718,25 +677,26 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il); cb(conv_output_proper, "conv_output_raw", il);
conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
cb(conv_output_proper, "conv_output_pre_silu", il);
ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
cb(conv_output_silu, "conv_output_silu", il); cb(conv_output_silu, "conv_output_silu", il);
ggml_tensor * conv_qkv_mix = conv_output_silu; ggml_tensor * conv_qkv_mix =
ggml_cont_2d(ctx0, ggml_transpose(ctx0, conv_output_silu), qkv_dim, n_seq_tokens * n_seqs);
// Calculate the total conv dimension cb(conv_qkv_mix, "conv_qkv_mix", il);
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output // Extract the convolved Q, K, V from conv_output
ggml_tensor * q_conv = ggml_tensor * q_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1], 0);
cb(q_conv, "q_conv", il); cb(q_conv, "q_conv", il);
ggml_tensor * k_conv = ggml_tensor * k_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il); cb(k_conv, "k_conv", il);
ggml_tensor * v_conv = ggml_tensor * v_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il); cb(v_conv, "v_conv", il);
@@ -745,6 +705,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
cb(state, "state_predelta", il); cb(state, "state_predelta", il);
@@ -776,29 +738,45 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(v_conv, "v_conv_predelta", il); cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state) ggml_tensor * attn_out;
if (n_seq_tokens == 1) { if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else { } else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
} }
ggml_tensor * output = attn_out.first; cb(attn_out, "attn_out", il);
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il); // The tensors were concatenated 1d, so we need to extract them 1d as well
cb(new_state, "new_state", il); const int64_t output_flat_size = head_v_dim * num_v_heads * n_seq_tokens * n_seqs;
ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
cb(attn_out_1d, "attn_out_1d", il);
ggml_tensor * attn_out_final = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
cb(attn_out_final, "attn_out_reshaped", il);
// Extract the state part (second part of the concatenated tensor)
// State starts after n_tokens elements along dimension 1
const int64_t state_flat_size = head_v_dim * head_v_dim * num_v_heads * n_seqs;
ggml_tensor * state_1d =
ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out));
cb(state_1d, "state_1d", il);
// Update the recurrent states // Update the recurrent states
ggml_build_forward_expand(gf, ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state, ggml_cpy(ctx0, state_1d,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
// Reshape both attn_out_final and z to 2D tensors for normalization // Reshape both attn_out_final and z to 2D tensors for normalization
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); ggml_tensor * attn_out_2d_final =
ggml_cont_2d(ctx0, attn_out_final, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); ggml_tensor * z_2d = ggml_cont_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z) // Apply gated normalization: self.norm(core_attn_out, z)
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
@@ -850,6 +828,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
shared_gate = ggml_sigmoid(ctx0, shared_gate); shared_gate = ggml_sigmoid(ctx0, shared_gate);
cb(shared_gate, "shared_expert_gate_sigmoid", il); cb(shared_gate, "shared_expert_gate_sigmoid", il);
// The gate needs to be broadcast to match the dimensions of ffn_shexp
// ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
// We need to repeat the gate along the feature dimension
shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
cb(shared_gate, "shared_expert_gate_broadcast", il);
// Apply the gate to the shared expert output // Apply the gate to the shared expert output
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
cb(ffn_shexp, "ffn_shexp_gated", il); cb(ffn_shexp, "ffn_shexp_gated", il);
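Aside (not part of the diff): the restored qwen3next path above carves the fused mixed_qkvz projection into q/k/v/z purely through views with byte offsets into each contiguous row, and later pulls the delta-net result apart the same way (flat output first, recurrent state after it). A minimal sketch of that offset arithmetic, independent of ggml; all names below are illustrative only.

    #include <cstddef>
    #include <cstdint>

    struct qkvz_offsets {
        size_t q, k, v, z; // byte offsets of each slice within one fused row
    };

    // Per k-head group the row holds [q | k | v | z]; the v and z slices each
    // carry num_v_heads / num_k_heads value-sized heads, matching split_sizes_qkvz.
    static qkvz_offsets fused_row_offsets(int64_t head_k_dim, int64_t head_v_dim,
                                          int64_t num_k_heads, int64_t num_v_heads,
                                          size_t elem_size) {
        const int64_t v_per_group = head_v_dim * num_v_heads / num_k_heads;
        qkvz_offsets o;
        o.q = 0;
        o.k = head_k_dim * elem_size;
        o.v = 2 * head_k_dim * elem_size;
        o.z = (2 * head_k_dim + v_per_group) * elem_size;
        return o;
    }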

View File

@@ -2,8 +2,7 @@
llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -17,6 +16,17 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -110,9 +120,8 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (il < (int) n_deepstack_layers) { if (ubatch.embd && (size_t)il < n_deepstack_layers) {
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float)); cur = ggml_add(ctx0, cur, deepstack_features[il]);
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
} }

View File

@@ -2,8 +2,7 @@
llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -17,6 +16,17 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@@ -103,9 +113,8 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (il < (int) n_deepstack_layers) { if (ubatch.embd && (size_t)il < n_deepstack_layers) {
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float)); cur = ggml_add(ctx0, cur, deepstack_features[il]);
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
} }
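Aside (not part of the diff): both sides of the qwen3vl hunks above read deepstack slice i at byte offset (i + 1) * n_embd * sizeof(float) from the packed per-token image embedding, i.e. the row is assumed to be laid out as [main | deepstack_0 | deepstack_1 | ...] along the feature axis, and slice i is added to the residual stream at layer i. A tiny sketch of that layout with illustrative names and plain floats.

    #include <cstdint>
    #include <vector>

    // row: one token's packed embedding of n_embd * (1 + n_deepstack_layers) floats
    static const float * deepstack_slice(const std::vector<float> & row,
                                         int64_t n_embd, size_t i) {
        return row.data() + (i + 1) * n_embd; // slice i feeds layer i's output
    }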

View File

@@ -26,16 +26,10 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
ggml_tensor * probs = nullptr;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
il % hparams.n_no_rope_layer_step != 0;
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
cb(probs, "ffn_moe_logits", il); cb(probs, "ffn_moe_logits", il);
// norm // norm
@@ -58,11 +52,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
if (use_rope) { if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
} }
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
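Aside (not part of the diff): the restored smallthinker condition applies RoPE on every layer except multiples of n_no_rope_layer_step, and on all layers when the step equals n_layer. A quick illustration with assumed example values, not taken from any model.

    #include <cstdio>

    int main() {
        const int n_layer = 8, n_no_rope_layer_step = 4; // assumed example values
        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = n_no_rope_layer_step == n_layer ||
                                  il % n_no_rope_layer_step != 0;
            std::printf("layer %d: rope=%d\n", il, use_rope); // layers 0 and 4 print 0
        }
        return 0;
    }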

View File

@@ -985,11 +985,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION }, { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK }, { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL }, { "\\p{S}", unicode_cpt_flags::SYMBOL },
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
}; };
static const std::map<int, int> k_ucat_cpt = { static const std::map<int, int> k_ucat_cpt = {
@@ -1100,26 +1095,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue; continue;
} }
// Match \p{...} Unicode properties of varying lengths if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
regex_expr[i + 1] == 'p' && regex_expr[i + 1] == 'p' &&
regex_expr[i + 2] == '{') { regex_expr[i + 2] == '{' &&
// Find the closing brace regex_expr[i + 4] == '}') {
size_t closing_brace = regex_expr.find('}', i + 3); const std::string pat = regex_expr.substr(i, 5);
if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
const std::string pat = regex_expr.substr(i, closing_brace - i + 1); if (!inside) {
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { regex_expr_collapsed += '[';
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i = closing_brace;
continue;
} }
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i += 4;
continue;
} }
} }
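Aside (not part of the diff): the restored tokenizer path only recognizes single-letter categories written as exactly \p{X} (five characters) and collapses them into a bracket expression, whereas the reverted change had generalized the match to longer names such as \p{Lu}. A simplified sketch of the fixed-width match, with placeholder maps standing in for the real k_ucat_cpt / k_ucat_map tables.

    #include <map>
    #include <string>

    // Returns true and appends a collapsed class when re[i..] is exactly "\p{X}".
    static bool collapse_ucat(const std::string & re, size_t i, std::string & out) {
        static const std::map<std::string, char> k_ucat = {
            { "\\p{N}", '0' }, { "\\p{L}", 'A' },   // placeholder markers only
        };
        if (i + 4 < re.size() && re[i] == '\\' && re[i + 1] == 'p' &&
            re[i + 2] == '{' && re[i + 4] == '}') {
            auto it = k_ucat.find(re.substr(i, 5));
            if (it != k_ucat.end()) {
                out += '[';
                out += it->second;
                out += ']';
                return true; // caller then advances i by 4 and continues
            }
        }
        return false;
    }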

View File

@@ -32,6 +32,10 @@ struct clip_graph {
const float kq_scale; const float kq_scale;
const clip_flash_attn_type flash_attn_type; const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr; ggml_context_ptr ctx0_ptr;
ggml_context * ctx0; ggml_context * ctx0;
ggml_cgraph * gf; ggml_cgraph * gf;

View File

@@ -45,14 +45,13 @@
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" #define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific // audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -139,62 +138,6 @@
#define TN_TOK_BOI "v.boi" #define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi" #define TN_TOK_EOI "v.eoi"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
// Stage 0 Block (Edge Residual)
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
// Stage 1+ Block (Universal Inverted Residual)
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
// Attention Components
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
// MSFA
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
// align x to upper multiple of n // align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
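Worked instance of the alignment macro above, which rounds x up to the next multiple of n (illustrative values, not from the diff):

    static_assert(((250 + 32 - 1) / 32) * 32 == 256, "CLIP_ALIGN(250, 32) rounds up to 256");
    static_assert(((256 + 32 - 1) / 32) * 32 == 256, "already-aligned values are unchanged");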
@@ -212,8 +155,6 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL, PROJECTOR_TYPE_QWEN25VL,
@@ -224,15 +165,12 @@ enum projector_type {
PROJECTOR_TYPE_GLMA, PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@@ -246,8 +184,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
@@ -257,15 +193,12 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLMA, "glma"}, { PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"}, { PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"}, { PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@@ -4,7 +4,6 @@
#include "clip.h" #include "clip.h"
#include "clip-impl.h" #include "clip-impl.h"
#include <array>
#include <vector> #include <vector>
#include <unordered_set> #include <unordered_set>
#include <cstdint> #include <cstdint>
@@ -61,7 +60,6 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer; std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0; int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0; int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio // audio
int32_t n_mel_bins = 0; // whisper preprocessor int32_t n_mel_bins = 0; // whisper preprocessor
@@ -144,74 +142,11 @@ struct clip_layer {
ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr; ggml_tensor * deepstack_fc2_b = nullptr;
// lfm2
ggml_tensor * ff_norm_w = nullptr;
ggml_tensor * ff_norm_b = nullptr;
ggml_tensor * ff_norm_1_w = nullptr;
ggml_tensor * ff_norm_1_b = nullptr;
ggml_tensor * ff_up_1_w = nullptr;
ggml_tensor * ff_up_1_b = nullptr;
ggml_tensor * ff_down_1_w = nullptr;
ggml_tensor * ff_down_1_b = nullptr;
ggml_tensor * pos_bias_u = nullptr;
ggml_tensor * pos_bias_v = nullptr;
ggml_tensor * norm_conv_w = nullptr;
ggml_tensor * norm_conv_b = nullptr;
ggml_tensor * linear_pos_w = nullptr;
ggml_tensor * conv_norm_w = nullptr;
ggml_tensor * conv_norm_b = nullptr;
ggml_tensor * conv_dw_w = nullptr;
ggml_tensor * conv_dw_b = nullptr;
ggml_tensor * conv_pw1_w = nullptr;
ggml_tensor * conv_pw1_b = nullptr;
ggml_tensor * conv_pw2_w = nullptr;
ggml_tensor * conv_pw2_b = nullptr;
bool has_deepstack() const { bool has_deepstack() const {
return deepstack_fc1_w != nullptr; return deepstack_fc1_w != nullptr;
} }
}; };
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
struct mobilenetv5_block {
// Stage 0 (Edge Residual)
ggml_tensor * s0_conv_exp_w = nullptr;
ggml_tensor * s0_bn1_w = nullptr;
ggml_tensor * s0_conv_pwl_w = nullptr;
ggml_tensor * s0_bn2_w = nullptr;
// Stage 1+ (Universal Inverted Residual)
ggml_tensor * dw_start_w = nullptr;
ggml_tensor * dw_start_bn_w = nullptr;
ggml_tensor * pw_exp_w = nullptr;
ggml_tensor * pw_exp_bn_w = nullptr;
ggml_tensor * dw_mid_w = nullptr;
ggml_tensor * dw_mid_bn_w = nullptr;
ggml_tensor * pw_proj_w = nullptr;
ggml_tensor * pw_proj_bn_w = nullptr;
ggml_tensor * layer_scale_w = nullptr;
// Attention (MQA) components
ggml_tensor * attn_q_w = nullptr;
ggml_tensor * attn_k_w = nullptr;
ggml_tensor * attn_v_w = nullptr;
ggml_tensor * attn_o_w = nullptr;
// Optional downsampling/norm in attention
ggml_tensor * attn_k_dw_w = nullptr;
ggml_tensor * attn_k_norm_w = nullptr;
ggml_tensor * attn_v_dw_w = nullptr;
ggml_tensor * attn_v_norm_w = nullptr;
// Block norm (often present in attention blocks)
ggml_tensor * attn_norm_w = nullptr;
};
struct clip_model { struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION; clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +263,6 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr;
// mobilenetv5 for gemma3n
std::vector<mobilenetv5_block> mobilenet_blocks;
std::vector<int> mobilenet_stage_ends;
ggml_tensor * mobilenet_stem_conv_w = nullptr;
ggml_tensor * mobilenet_stem_conv_b = nullptr;
ggml_tensor * mobilenet_stem_norm_w = nullptr;
ggml_tensor * mm_post_proj_norm_w = nullptr;
// Multi-Scale Fusion Adapter (MSFA) components
ggml_tensor * msfa_concat_conv_w = nullptr;
ggml_tensor * msfa_concat_norm_w = nullptr;
ggml_tensor * msfa_ffn_expand_w = nullptr;
ggml_tensor * msfa_ffn_project_w = nullptr;
ggml_tensor * msfa_ffn_expand_bn = nullptr;
ggml_tensor * msfa_ffn_project_bn = nullptr;
// pixtral, glm4v // pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr; ggml_tensor * mm_patch_merger_w = nullptr;
@@ -368,16 +286,9 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr; ggml_tensor * mm_eoi = nullptr;
// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
ggml_tensor * pre_encode_out_w = nullptr;
ggml_tensor * pre_encode_out_b = nullptr;
bool audio_has_avgpool() const { bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL || proj_type == PROJECTOR_TYPE_VOXTRAL;
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
} }
bool audio_has_stack_frames() const { bool audio_has_stack_frames() const {

View File

@@ -165,14 +165,18 @@ struct clip_ctx {
ggml_backend_t backend_cpu = nullptr; ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_ptr buf; ggml_backend_buffer_ptr buf;
int max_nodes = 8192; int max_nodes = 8192;
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false; bool is_allocated = false;
// for debugging
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;
clip_ctx(clip_context_params & ctx_params) { clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type; flash_attn_type = ctx_params.flash_attn_type;
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) { if (!backend_cpu) {
throw std::runtime_error("failed to initialize CPU backend"); throw std::runtime_error("failed to initialize CPU backend");
@@ -213,10 +217,6 @@ struct clip_ctx {
sched.reset( sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
); );
if (ctx_params.cb_eval != nullptr) {
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
}
} }
~clip_ctx() { ~clip_ctx() {
@@ -252,7 +252,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
n_mmproj_embd(clip_n_mmproj_embd(ctx)), n_mmproj_embd(clip_n_mmproj_embd(ctx)),
eps(hparams.eps), eps(hparams.eps),
kq_scale(1.0f / sqrtf((float)d_head)), kq_scale(1.0f / sqrtf((float)d_head)),
flash_attn_type(ctx->flash_attn_type) { flash_attn_type(ctx->flash_attn_type),
debug_graph(ctx->debug_graph),
debug_print_tensors(ctx->debug_print_tensors) {
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(), /*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(), /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -263,11 +265,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
} }
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
if (il >= 0) { if (debug_graph) {
ggml_format_name(cur, "%s-%d", name, il); ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
} else { std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
ggml_set_name(cur, name); ggml_set_name(cur, cur_name.c_str());
ggml_set_output(cur);
ggml_build_forward_expand(gf, cur);
debug_print_tensors.push_back(cur);
} }
} }
@@ -796,10 +801,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_siglip>(ctx, img); builder = std::make_unique<clip_graph_siglip>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR: case PROJECTOR_TYPE_LIGHTONOCR:
{ {
@@ -830,7 +831,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img); builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
} break; } break;
@@ -850,18 +850,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_llava>(ctx, img); builder = std::make_unique<clip_graph_llava>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
builder = std::make_unique<clip_graph_glm4v>(ctx, img); builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
default: default:
GGML_ABORT("missing cgraph builder"); GGML_ABORT("missing cgraph builder");
} }
@@ -1162,14 +1154,6 @@ struct clip_model_loader {
// test model (tinygemma3) has a different value, we optionally read it // test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
// Similar configuration to Gemma3
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
@@ -1187,20 +1171,6 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
} }
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
for (auto & layer : wa_layer_indexes_vec) {
hparams.wa_layer_indexes.insert(layer);
}
// support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
hparams.rope_theta = 10000.0f; hparams.rope_theta = 10000.0f;
@@ -1219,7 +1189,6 @@ struct clip_model_loader {
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL || model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1235,15 +1204,6 @@ struct clip_model_loader {
hparams.audio_window_len = 400; hparams.audio_window_len = 400;
hparams.audio_hop_len = 160; hparams.audio_hop_len = 160;
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
hparams.audio_chunk_len = 1; // in seconds
hparams.audio_sample_rate = 16000;
hparams.audio_n_fft = 512;
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
default: default:
break; break;
} }
@@ -1269,14 +1229,7 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (!hparams.wa_layer_indexes.empty()) {
LOG_INF("%s: wa_layer_indexes: ", __func__);
for (auto & layer : hparams.wa_layer_indexes) {
LOG_INF("%d ", layer);
}
LOG_INF("\n");
}
if (hparams.image_min_pixels > 0) { if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
} }
@@ -1358,10 +1311,6 @@ struct clip_model_loader {
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
hparams.n_layer = 0; // gemma3n does not use normal layer structure
}
// layers // layers
model.layers.resize(hparams.n_layer); model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1436,7 +1385,6 @@ struct clip_model_loader {
} }
} }
switch (model.proj_type) { switch (model.proj_type) {
case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_MLP_NORM:
@@ -1531,8 +1479,8 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
} break; } break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
@@ -1549,14 +1497,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1576,112 +1516,11 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
// Dynamically load blocks stage by stage
for (int stage = 0; stage < 4; ++stage) {
int blocks_found_in_stage = 0;
for (int blk_idx = 0; ; ++blk_idx) {
bool found_block = false;
mobilenetv5_block block;
// 1. Check for Edge Residual (S0)
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
if (block.s0_conv_exp_w) {
found_block = true;
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
}
// 2. Check for UIR (Universal Inverted Residual)
else {
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
if (block.dw_start_w || block.pw_exp_w) {
found_block = true;
if (block.dw_start_w) {
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
}
if (block.pw_exp_w) {
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
}
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
if (block.dw_mid_w) {
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
}
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
if (block.pw_proj_w) {
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
}
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
// 3. Check for Attention (MQA)
// Even if UIR/Edge check failed, this might be a pure attention block
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
if (attn_q_check) {
found_block = true;
block.attn_q_w = attn_q_check;
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
if (!block.layer_scale_w) {
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
if (found_block) {
model.mobilenet_blocks.push_back(block);
blocks_found_in_stage++;
} else {
// End of blocks for this stage
break;
}
}
// Track where this stage ends in the flat vector
if (blocks_found_in_stage > 0) {
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
}
}
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
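Aside (not part of the diff): the removed MobileNetV5 loader above discovers blocks by probing optional tensors per (stage, block) index, ends a stage at the first missing block, and records where each stage finishes in the flat block vector. A stripped-down sketch of that probing loop; find_tensor is a hypothetical stand-in for get_tensor(name, /*required=*/false).

    #include <functional>
    #include <string>
    #include <vector>

    static std::vector<int> probe_stage_ends(
            const std::function<bool(const std::string &)> & find_tensor) {
        std::vector<int> stage_ends;
        int n_blocks = 0;
        for (int stage = 0; stage < 4; ++stage) {
            int found_in_stage = 0;
            for (int blk = 0; ; ++blk) {
                const std::string base = "v.blk." + std::to_string(stage) + "." + std::to_string(blk);
                if (!find_tensor(base)) {
                    break; // first missing block ends the stage
                }
                ++n_blocks;
                ++found_in_stage;
            }
            if (found_in_stage > 0) {
                stage_ends.push_back(n_blocks - 1); // flat index of the stage's last block
            }
        }
        return stage_ends;
    }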
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
} break; } break;
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_KIMIVL: case PROJECTOR_TYPE_KIMIVL:
{ {
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1741,17 +1580,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
} break; } break;
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
} break;
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
{ {
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1773,8 +1601,8 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
} break; } break;
case PROJECTOR_TYPE_LLAMA4: case PROJECTOR_TYPE_LLAMA4:
{ {
@@ -1800,52 +1628,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
}
model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = model.layers[il];
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
}
} break;
default: default:
GGML_ASSERT(false && "unknown projector type"); GGML_ASSERT(false && "unknown projector type");
} }
@@ -2150,7 +1932,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
try { try {
clip_model_loader loader(fname); clip_model_loader loader(fname);
bool skip_audio = false;
if (loader.has_vision) { if (loader.has_vision) {
ctx_vision = new clip_ctx(ctx_params); ctx_vision = new clip_ctx(ctx_params);
@@ -2160,14 +1941,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
loader.warmup(*ctx_vision); loader.warmup(*ctx_vision);
} }
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
} }
if (loader.has_audio && !skip_audio) { if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params); ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio); loader.load_tensors(*ctx_audio);
@@ -2290,7 +2067,7 @@ struct img_tool {
std::array<uint8_t, 3> pad_color = {0, 0, 0}) { std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width; dst.nx = target_resolution.width;
dst.ny = target_resolution.height; dst.ny = target_resolution.height;
dst.buf.resize(3 * static_cast<size_t>(dst.nx) * static_cast<size_t>(dst.ny)); dst.buf.resize(3 * dst.nx * dst.ny);
if (dst.nx == src.nx && dst.ny == src.ny) { if (dst.nx == src.nx && dst.ny == src.ny) {
// no resize needed, simple copy // no resize needed, simple copy
@@ -2343,7 +2120,7 @@ struct img_tool {
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w; dst.nx = w;
dst.ny = h; dst.ny = h;
dst.buf.resize(3 * static_cast<size_t>(w) * static_cast<size_t>(h)); dst.buf.resize(3 * w * h);
for (int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
@@ -2440,7 +2217,7 @@ private:
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
dst.nx = target_width; dst.nx = target_width;
dst.ny = target_height; dst.ny = target_height;
dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height)); dst.buf.resize(3 * target_width * target_height);
float x_ratio = static_cast<float>(src.nx - 1) / target_width; float x_ratio = static_cast<float>(src.nx - 1) / target_width;
float y_ratio = static_cast<float>(src.ny - 1) / target_height; float y_ratio = static_cast<float>(src.ny - 1) / target_height;
@@ -2479,7 +2256,7 @@ private:
dst.nx = target_width; dst.nx = target_width;
dst.ny = target_height; dst.ny = target_height;
dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height)); dst.buf.resize(3 * target_width * target_height);
float Cc; float Cc;
float C[5] = {}; float C[5] = {};
@@ -2891,57 +2668,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res; // res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
const int patch_size = params.patch_size; // typically 16
const int merge_size = params.n_merge; // typically 2
const int align_size = patch_size * merge_size; // 32
const int max_num_patches = params.image_max_pixels > 0 ?
params.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
float scale = 1.0f;
int target_height = original_size.height;
int target_width = original_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
// Round up to nearest multiple of align_size
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
// Ensure at least one patch
return std::max(align_size, aligned);
};
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, original_size.height);
target_width = get_scaled_image_size(scale, original_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
int num_patches = num_patches_h * num_patches_w;
if (num_patches > max_num_patches) {
scale -= 0.02f;
} else {
break;
}
}
clip_image_size new_size = {target_width, target_height};
// Resize the image
clip_image_u8 resized;
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// Normalize to float32
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// Add to results
res_imgs->entries.push_back(std::move(img_f32));
} break;
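Aside (not part of the diff): the removed YoutuVL preprocessing keeps each side aligned to patch_size * merge_size and walks the scale down in 0.02 steps until the patch count fits the budget. A standalone sketch of that search with assumed parameter names.

    #include <algorithm>
    #include <cmath>

    struct wh { int width, height; };

    static wh fit_to_patch_budget(wh size, int patch_size, int merge_size, int max_num_patches) {
        const int align = patch_size * merge_size;
        auto scaled = [align](float scale, int v) {
            int a = static_cast<int>(std::ceil(v * scale / align)) * align;
            return std::max(align, a); // keep at least one merged patch per side
        };
        float scale = 1.0f;
        wh out = size;
        while (scale > 0.0f) {
            out = { scaled(scale, size.width), scaled(scale, size.height) };
            const int n_patches = (out.width / patch_size) * (out.height / patch_size);
            if (n_patches <= max_num_patches) {
                break;
            }
            scale -= 0.02f; // same step as the removed code
        }
        return out;
    }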
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
{ {
@@ -3005,16 +2731,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
clip_image_u8 resized_image;
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
{ {
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384 // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3184,7 +2900,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2; return (img->nx / params.patch_size) / 2;
default: default:
break; break;
@@ -3200,7 +2915,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2; return (img->ny / params.patch_size) / 2;
default: default:
break; break;
@@ -3261,7 +2975,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// dynamic size (2 conv, so double patch size) // dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2); int x_patch = img->nx / (params.patch_size * 2);
@@ -3277,12 +2990,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int scale_factor = ctx->model.hparams.n_merge; int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor); n_patches /= (scale_factor * scale_factor);
} break; } break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
// regardless of input size (see architecture description)
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
} break;
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL: case PROJECTOR_TYPE_KIMIVL:
{ {
@@ -3308,7 +3015,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{ {
n_patches = img->nx; n_patches = img->nx;
@@ -3341,10 +3047,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{ {
n_patches += 2; // for BOI and EOI token embeddings n_patches += 2; // for BOI and EOI token embeddings
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
} break;
default: default:
GGML_ABORT("unsupported projector type"); GGML_ABORT("unsupported projector type");
} }
@@ -3377,6 +3079,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
// build the inference graph // build the inference graph
ctx->debug_print_tensors.clear();
ggml_backend_sched_reset(ctx->sched.get()); ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3394,6 +3097,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size; const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size; const int pos_h = image_size_height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) { auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name); ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3542,11 +3246,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions); set_input_i32("positions", positions);
} break; } break;
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// pw * ph = number of tokens output by the ViT after applying the patch merger // pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * iph = number of vision tokens processed inside the ViT // ipw * iph = number of vision tokens processed inside the ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2; const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio; const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio;
@@ -3557,7 +3259,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw); std::vector<int> inv_idx(ph * pw);
if (use_window_attn) { if (use_window_attn) {
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112; const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio; const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0; int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3674,7 +3376,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("patches", patches); set_input_i32("patches", patches);
} break; } break;
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
@@ -3682,7 +3383,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
{ {
@@ -3705,27 +3405,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
set_input_i32("pos_w", pos_data); set_input_i32("pos_w", pos_data);
} break; } break;
case PROJECTOR_TYPE_LFM2A:
{
GGML_ASSERT(imgs.entries.size() == 1);
const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
auto d_model = 512;
auto seq_len = n_frames * 2 - 1;
std::vector<float> pos_emb(d_model*seq_len);
std::vector<double> inv_freq(d_model / 2);
for (size_t i = 0; i < inv_freq.size(); ++i) {
inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
}
for (int64_t pos = 0; pos < seq_len; ++pos) {
for (size_t i = 0; i < inv_freq.size(); ++i) {
const float ang = (n_frames - pos - 1) * inv_freq[i];
pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
}
}
set_input_f32("pos_emb", pos_emb);
} break;
default: default:
GGML_ABORT("Unknown projector type"); GGML_ABORT("Unknown projector type");
} }
@@ -3746,6 +3425,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return false; return false;
} }
// print debug nodes
if (ctx->debug_graph) {
LOG_INF("\n\n---\n\n");
LOG_INF("\n\nDebug graph:\n\n");
for (ggml_tensor * t : ctx->debug_print_tensors) {
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
print_tensor_shape(t);
print_tensor_data(t, data.data(), 3);
}
}
// the last node is the embedding tensor // the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1); ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -3784,19 +3475,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0]; return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths // main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0]; return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.projection->ne[1]; return ctx->model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1]; return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
return ctx->model.mm_3_w->ne[1]; return ctx->model.mm_3_w->ne[1];
@@ -3811,8 +3499,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1]; return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1]; return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1]; return ctx->model.mm_ffn_down_w->ne[1];
default: default:
@@ -3821,7 +3507,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
} }
int clip_is_minicpmv(const struct clip_ctx * ctx) { int clip_is_minicpmv(const struct clip_ctx * ctx) {
// TODO: remove this function
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
return ctx->model.hparams.minicpmv_version; return ctx->model.hparams.minicpmv_version;
} }
@@ -3829,14 +3514,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
} }
bool clip_is_glm(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) {
// TODO: remove this function
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
} }
bool clip_is_mrope(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}
bool clip_is_llava(const struct clip_ctx * ctx) { bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->model.hparams.has_llava_projector; return ctx->model.hparams.has_llava_projector;
} }
bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) { bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION; return ctx->model.modality == CLIP_MODALITY_VISION;
} }
@@ -3846,16 +3541,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
} }
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) { return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
case PROJECTOR_TYPE_ULTRAVOX: || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
case PROJECTOR_TYPE_QWEN2A: || ctx->proj_type() == PROJECTOR_TYPE_GLMA
case PROJECTOR_TYPE_GLMA: || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return true;
default:
return false;
}
} }
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3897,6 +3586,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
// //
// API for debugging // API for debugging
// //
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img; clip_image_f32 img;
img.nx = w; img.nx = w;
@@ -3905,6 +3595,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
for (int i = 0; i < h * w * 3; i++) { for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value); img.buf[i] = static_cast<float>(fill_value);
} }
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr); clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here"); GGML_ASSERT(img.buf.empty() && "expected, always stop here");
} }

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
#include "mtmd.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
@@ -38,8 +37,6 @@ struct clip_context_params {
int image_min_tokens; int image_min_tokens;
int image_max_tokens; int image_max_tokens;
bool warmup; bool warmup;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
}; };
struct clip_init_result { struct clip_init_result {
@@ -107,9 +104,9 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx); int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated bool clip_is_gemma3(const struct clip_ctx * ctx);
// do NOT add new functions like this
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@@ -1,216 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_conformer::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
ggml_set_name(pos_emb, "pos_emb");
ggml_set_input(pos_emb);
ggml_build_forward_expand(gf, pos_emb);
ggml_tensor * inp = build_inp_raw(1);
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
// pre encode, conv subsampling
{
// layer.0 - conv2d
cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
cb(cur, "conformer.pre_encode.conv.{}", 0);
// layer.1 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.2 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
cb(cur, "conformer.pre_encode.conv.{}", 2);
// layer.3 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
cb(cur, "conformer.pre_encode.conv.{}", 3);
// layer.4 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.5 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
cb(cur, "conformer.pre_encode.conv.{}", 5);
// layer.6 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
cb(cur, "conformer.pre_encode.conv.{}", 6);
// layer.7 - relu
cur = ggml_relu_inplace(ctx0, cur);
// flatten channel and frequency axis
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
// calculate out
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
cb(cur, "conformer.pre_encode.out", -1);
}
// pos_emb
cb(pos_emb, "pos_emb", -1);
for (int il = 0; il < hparams.n_layer; il++) {
const auto & layer = model.layers[il];
auto * residual = cur;
cb(cur, "layer.in", il);
// feed_forward1
cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
il);
cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
const auto fc_factor = 0.5f;
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
// self-attention
{
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_self_att", il);
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
// build_attn won't fit due to matrix_ac and matrix_bd separation
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
// rel shift
{
const auto pos_len = matrix_bd->ne[0];
const auto q_len = matrix_bd->ne[1];
const auto h = matrix_bd->ne[2];
matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
}
matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
cb(scores, "conformer.layers.{}.self_attn.id0", il);
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
out = ggml_add(ctx0, out, layer.o_b);
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
cur = out;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_conv", il);
// conv
{
auto * x = cur;
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
x = ggml_add(ctx0, x, layer.conv_pw1_b);
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
// ggml_glu doesn't support sigmoid
// TODO @ngxson : support this op in ggml
{
int64_t d = x->ne[0] / 2;
ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
}
// use ggml_ssm_conv for f32 precision
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
x = ggml_add(ctx0, x, layer.conv_dw_b);
x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
x = ggml_silu(ctx0, x);
// pointwise_conv2
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
x = ggml_add(ctx0, x, layer.conv_pw2_b);
cur = x;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
cb(residual, "conformer.layers.{}.conv.id", il);
cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_out", il);
}
// audio adapter
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cb(cur, "audio_adapter.model.{}", 0);
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
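For readers tracing the deleted conformer graph: the pad/roll/reshape/view sequence in the "rel shift" block above is the Transformer-XL relative-shift trick. A plain-C++ sketch of the equivalent indexing, assuming pos_len == 2*q_len - 1 as in this graph (illustrative only, not ggml):

#include <cassert>
#include <vector>

// out[q][k] = x[q][k - q + q_len - 1], i.e. each row is re-centred so that
// column k holds the value for relative distance (k - q).
static std::vector<std::vector<float>> rel_shift(const std::vector<std::vector<float>> & x) {
    const int q_len   = (int) x.size();
    const int pos_len = (int) x[0].size();
    assert(pos_len == 2 * q_len - 1); // assumed shape of the relative position axis
    std::vector<std::vector<float>> out(q_len, std::vector<float>(q_len, 0.0f));
    for (int q = 0; q < q_len; ++q) {
        for (int k = 0; k < q_len; ++k) {
            out[q][k] = x[q][k - q + q_len - 1];
        }
    }
    return out;
}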

View File

@@ -1,451 +0,0 @@
#include "models.h"
// Helpers for MobileNetV5 Blocks
// RMS Norm 2D - normalizes over channels for each spatial position
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
// inp: [W, H, C, B]
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
cur = ggml_rms_norm(ctx0, cur, eps);
if (weight) {
cur = ggml_mul(ctx0, cur, weight);
}
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
return cur;
}
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
const int64_t ih = inp->ne[1]; // height
const int64_t iw = inp->ne[0]; // width
// Calculate output size (ceil division)
const int64_t oh = (ih + stride_h - 1) / stride_h;
const int64_t ow = (iw + stride_w - 1) / stride_w;
// Calculate padding needed
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
// Split padding asymmetrically
const int pad_h_top = pad_h / 2;
const int pad_h_bottom = pad_h - pad_h_top;
const int pad_w_left = pad_w / 2;
const int pad_w_right = pad_w - pad_w_left;
// Apply padding if needed
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
if (pad_h > 0 || pad_w > 0) {
inp = ggml_pad_ext(ctx0, inp,
pad_w_left, pad_w_right, // width padding (dim 0)
pad_h_top, pad_h_bottom, // height padding (dim 1)
0, 0, // no channel padding (dim 2)
0, 0); // no batch padding (dim 3)
}
return inp;
}
// Edge Residual Block (Stage 0)
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Expansion Conv (3x3)
if (stride == 2) {
// Case: Downsampling (Block 0)
// Replicates Conv2dSame(kernel=3, stride=2)
cur = pad_same_2d(cur, 3, 3, stride, stride);
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
} else {
// Case: Normal 3x3 Block (Block 1, 2)
// Replicates Conv2d(kernel=3, stride=1, padding=1)
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
}
// BN + Activation
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
cur = ggml_gelu(ctx0, cur);
// 2. Pointwise Linear Conv (1x1)
// 1x1 Convs usually have padding=0 and stride=1
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
// 3. Residual Connection
// Only apply residual if spatial dimensions and channels match (stride 1)
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Universal Inverted Residual Block (Stage 1+)
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Depthwise Start (Optional)
// NOTE: dw_start always has stride=1 (no downsampling here)
if (block.dw_start_w) {
int k = block.dw_start_w->ne[0]; // 3 or 5
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
}
// 2. Pointwise Expansion (1x1)
if (block.pw_exp_w) {
// Standard 1x1 conv, pad=0, stride=1
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 3. Depthwise Mid (Optional)
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
if (block.dw_mid_w) {
int k = block.dw_mid_w->ne[0]; // 3 or 5
if (stride > 1) {
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
cur = pad_same_2d(cur, k, k, stride, stride);
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
} else {
// Case: Stride 1 -> Use Standard Symmetric Padding
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
}
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 4. Pointwise Projection (1x1)
if (block.pw_proj_w) {
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
}
// Apply Layer Scaling if present
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
// 5. Residual Connection
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
bool same_channel = (inp->ne[2] == cur->ne[2]);
if (same_spatial && same_channel) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Attention Block (MQA)
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
ggml_tensor * cur = inp;
// Norm
if (block.attn_norm_w) {
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
}
// 1. Q Calculation
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
// 2. K Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * k_inp = cur;
if (block.attn_k_dw_w) {
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_k_norm_w) {
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
}
}
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
// 3. V Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * v_inp = cur;
if (block.attn_v_dw_w) {
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_v_norm_w) {
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
}
}
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
const int D = k->ne[2]; // Head dimension
const int n_head = q->ne[2] / D;
const int N = W * H;
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
q = ggml_cont(ctx0, q);
const int Wk = k->ne[0]; const int Hk = k->ne[1];
const int M = Wk * Hk;
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
k = ggml_reshape_3d(ctx0, k, M, D, B);
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
k = ggml_cont(ctx0, k);
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
v = ggml_reshape_3d(ctx0, v, M, D, B);
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
v = ggml_cont(ctx0, v); // [M, D, 1, B]
// Multi-Query Attention
float scale = 1.0f / sqrtf((float)D);
// Step 1: Compute Q @ K.T
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
scores = ggml_scale(ctx0, scores, scale);
scores = ggml_soft_max(ctx0, scores);
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
kqv = ggml_cont(ctx0, kqv);
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
kqv = ggml_cont(ctx0, kqv);
// Output projection
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
// Residual & Layer Scale
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
ggml_cgraph * clip_graph_mobilenetv5::build() {
ggml_tensor * inp = build_inp_raw();
// 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
if (model.mobilenet_stem_conv_b) {
cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
}
if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
cur = ggml_gelu(ctx0, cur);
// 2. Blocks
std::vector<ggml_tensor*> intermediate_features;
const int total_blocks = model.mobilenet_blocks.size();
auto is_stage_start = [&](int i) {
if (i == 0) return true;
for (int end_idx : model.mobilenet_stage_ends) {
if (i == end_idx + 1) return true;
}
return false;
};
auto is_fusion_point = [&](int i) {
if (model.mobilenet_stage_ends.size() >= 4) {
if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
} else {
if (i == total_blocks - 1) return true;
}
return false;
};
for (int i = 0; i < total_blocks; i++) {
const auto & block = model.mobilenet_blocks[i];
int stride = is_stage_start(i) ? 2 : 1;
if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
else cur = build_inverted_residual(cur, block, stride);
if (is_fusion_point(i)) {
intermediate_features.push_back(cur);
}
}
// 3. Multi-Scale Fusion Adapter (MSFA)
if (!intermediate_features.empty()) {
// A. Reference Resolution: PyTorch implementation uses inputs[0]
// We assume intermediate_features[0] is the "High Resolution" target.
// In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
ggml_tensor* target_feat = intermediate_features[0];
int high_res_w = target_feat->ne[0];
int high_res_h = target_feat->ne[1];
std::vector<ggml_tensor*> resized_feats;
// B. Resize inputs to match inputs[0] (High Resolution)
for (auto feat : intermediate_features) {
int feat_w = feat->ne[0];
int feat_h = feat->ne[1];
// PyTorch: if feat_size < high_resolution: interpolate
if (feat_w < high_res_w || feat_h < high_res_h) {
// Calculate scale factor.
// Note: PyTorch 'nearest' works on arbitrary float scales.
// ggml_upscale generally takes integer factors or target sizes depending on helper.
// Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
int scale_w = high_res_w / feat_w;
// int scale_h = high_res_h / feat_h;
// Safety check for non-integer scaling if strictly replicating
GGML_ASSERT(high_res_w % feat_w == 0);
// Upsample (Nearest Neighbor)
// scale_w is the integer upscale factor (typically 2)
feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
}
resized_feats.push_back(feat);
}
// C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
cur = resized_feats[0];
for (size_t k = 1; k < resized_feats.size(); ++k) {
cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
}
// D. FFN (UniversalInvertedResidual)
// Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
// 1. Expansion
if (model.msfa_ffn_expand_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
if (model.msfa_ffn_expand_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
}
cur = ggml_gelu(ctx0, cur);
}
// 2. Projection (No DW because kernel_size=0)
if (model.msfa_ffn_project_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
// UniversalInvertedResidual typically has a norm after projection
if (model.msfa_ffn_project_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
}
}
// E. Final Downsample to Target Resolution (Output Resolution)
// PyTorch: matches self.output_resolution (e.g. 16x16)
const int target_out_res = 16;
int current_w = cur->ne[0];
if (current_w > target_out_res) {
int s = current_w / target_out_res;
GGML_ASSERT(current_w % target_out_res == 0);
// Avg Pool: Kernel=s, Stride=s
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
}
// F. Final Norm
if (model.msfa_concat_norm_w) {
cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
}
}
// 4. Gemma 3n Multimodal Projection (Embedder)
// Input: 'cur' is [Width, Height, Channels, Batch]
int W = cur->ne[0];
int H = cur->ne[1];
int C = cur->ne[2];
int B = cur->ne[3];
GGML_ASSERT(C == hparams.n_embd);
// 1. Permute and Flatten to [Channels, Tokens, Batch]
// PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
cur = ggml_cont(ctx0, cur);
cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
cur = ggml_cont(ctx0, cur);
// 2. FEATURE SCALING
// PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
const float scale_factor = sqrtf((float)C);
cur = ggml_scale(ctx0, cur, scale_factor);
// 3. SOFT EMBEDDING NORM
// PyTorch: self._norm(x) * self.weight
// We must normalize regardless, then multiply if weight exists.
{
const float eps = 1e-6f; // Gemma3n uses 1e-6
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_soft_emb_norm_w) {
// Weight shape is (2048,) -> Element-wise broadcast multiply
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
}
}
// 4. PROJECTION
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
if (model.mm_input_proj_w) {
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
}
// 5. POST PROJECTION NORM
// PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
// with_scale=False means weight is registered as buffer with value 1.0
// So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
{
const float eps = 1e-6f;
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_post_proj_norm_w) {
// If weight is loaded, multiply (should be ~1.0 anyway)
cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
}
}
ggml_build_forward_expand(gf, cur);
return gf;
}
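One piece of the deleted MobileNetV5 graph worth a worked example is pad_same_2d: PyTorch-style "same" padding derives the total pad from the ceil-divided output size and splits it asymmetrically. A hedged sketch of that arithmetic for a single dimension (e.g. in=7, kernel=3, stride=2, dilation=1 gives out=4, pad=2, split 1/1):

#include <algorithm>
#include <cstdint>

struct same_pad { int before; int after; };

// Total pad = max(0, (out - 1) * stride + (kernel - 1) * dilation + 1 - in),
// where out = ceil(in / stride); the odd element goes on the trailing side.
static same_pad compute_same_pad(int64_t in, int kernel, int stride, int dilation = 1) {
    const int64_t out = (in + stride - 1) / stride;
    const int64_t pad = std::max<int64_t>(0, (out - 1) * stride + (kernel - 1) * dilation + 1 - in);
    return { (int) (pad / 2), (int) (pad - pad / 2) };
}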

View File

@@ -2,11 +2,6 @@
#include "../clip-graph.h" #include "../clip-graph.h"
/*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
struct clip_graph_siglip : clip_graph { struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph { struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
@@ -67,45 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph { struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * rms_norm_2d(
ggml_tensor * inp,
ggml_tensor * weight,
float eps = 1e-6f);
ggml_tensor* pad_same_2d(
ggml_tensor* inp,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h = 1,
int dilation_w = 1);
ggml_tensor * build_edge_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_inverted_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_mobilenet_attn(
ggml_tensor * inp,
const mobilenetv5_block & block);
};

View File

@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
const int scale_factor = model.hparams.n_merge; const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor); cur = build_patch_merge_permute(cur, scale_factor);
// projection, in LFM2-VL input norm is optional // projection
if (model.mm_input_norm_w) { cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
if (model.mm_input_norm_b) {
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
cur = build_ffn(cur, cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b, model.mm_1_w, model.mm_1_b,

View File

@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
FFN_GELU_ERF, FFN_GELU_ERF,
-1); -1);
} else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_GLMA) { } else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps); cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View File

@@ -1,179 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// change conv3d to linear
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
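A note on the deleted youtuvl graph: the ggml_reshape_2d/ggml_get_rows pair treats each group of 4 consecutive tokens (one 2x2 merge unit) as a single row and permutes those rows by inv_window_idx; window_idx applies the inverse permutation after the encoder. A minimal sketch of that gather on plain buffers (illustrative, not ggml):

#include <algorithm>
#include <cstddef>
#include <vector>

// emb holds n_tokens rows of n_embd floats; group_idx has n_tokens / 4 entries,
// each naming the source 4-token block to copy into the next destination slot.
static std::vector<float> reorder_token_groups(const std::vector<float> & emb,
                                               size_t n_embd,
                                               const std::vector<int> & group_idx) {
    const size_t group_stride = 4 * n_embd; // one merged 2x2 block
    std::vector<float> out(emb.size());
    for (size_t dst = 0; dst < group_idx.size(); ++dst) {
        const size_t src = (size_t) group_idx[dst];
        std::copy(emb.begin() + src * group_stride,
                  emb.begin() + (src + 1) * group_stride,
                  out.begin() + dst * group_stride);
    }
    return out;
}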

View File

@@ -9,250 +9,207 @@
#include <fstream> #include <fstream>
#include <algorithm> #include <algorithm>
// some of the code here is copied from whisper.cpp // most of the code here is copied from whisper.cpp
constexpr bool DEBUG = false; constexpr bool DEBUG = false;
void mtmd_audio_cache::fill_sin_cos_table(int n) { struct mtmd_audio_mel_filters {
sin_vals.resize(n); int32_t n_mel;
cos_vals.resize(n); int32_t n_fft;
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { std::vector<float> data;
hann_window.resize(length); };
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, // note: this global cache is shared among all preprocessors
int n_fft, // if we want to use multiple preprocessors at the same time,
int sample_rate, // we will need to enclose it in the preprocessor class in the future
float fmin, static struct mtmd_audio_global_cache {
float fmax, // precomputed sin/cos table for FFT
bool slaney_area_norm, std::vector<float> sin_vals;
float scale) { std::vector<float> cos_vals;
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default) // hann window
const double min_log_hz = 1000.0; std::vector<float> hann_window;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins // mel filter bank
const double bin_hz_step = double(sample_rate) / double(n_fft); mtmd_audio_mel_filters filters;
// mel grid: n_mel + 2 edges void fill_sin_cos_table(int n) {
const double m_lo = hz_to_mel(fmin); sin_vals.resize(n);
const double m_hi = hz_to_mel(fmax); cos_vals.resize(n);
std::vector<double> mel_pts(n_mel + 2); for (int i = 0; i < n; i++) {
for (int i = 0; i < n_mel + 2; ++i) { double theta = (2 * M_PI * i) / n;
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); sin_vals[i] = sinf(theta);
} cos_vals[i] = cosf(theta);
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
} }
} }
filters.n_mel = n_mel; void fill_hann_window(int length, bool periodic) {
filters.n_fft = n_fft; hann_window.resize(length);
filters.data = std::move(out); int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
if (DEBUG) { // debug // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
for (size_t i = 0; i < filters.data.size(); ++i) { // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
if (filters.data[i] != 0.0f) { void fill_mel_filterbank_matrix(
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f); int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling; use 1.0f/1000.0f to mimic your code
) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
}
} }
} }
} }
} } g_cache;
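Both versions of the filterbank code above use the Slaney mel scale, which is linear below 1 kHz (15 mel per kHz) and logarithmic above, with the two branches meeting at 1000 Hz = 15 mel. A small self-contained restatement of those conversions (illustrative; constants taken from the code above):

#include <cmath>

static double hz_to_mel_slaney(double f_hz) {
    const double lin_slope = 3.0 / 200.0;          // 0.015 mel per Hz
    const double log_step  = std::log(6.4) / 27.0;
    return f_hz < 1000.0 ? f_hz * lin_slope
                         : 15.0 + std::log(f_hz / 1000.0) / log_step;
}

static double mel_to_hz_slaney(double m) {
    const double lin_slope = 3.0 / 200.0;
    const double log_step  = std::log(6.4) / 27.0;
    return m < 15.0 ? m / lin_slope
                    : 1000.0 * std::exp((m - 15.0) * log_step);
}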
// Unified DFT implementation for both forward and inverse transforms // naive Discrete Fourier Transform
// Template parameters: // input is real-valued
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling // output is complex-valued
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N static void dft(const float * in, int N, float * out) {
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations const int n_sin_cos_vals = g_cache.sin_vals.size();
// false = input is complex-valued (interleaved real/imag, stride 2) const int sin_cos_step = n_sin_cos_vals / N;
template <bool Inverse, bool RealInput>
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
const float scale = Inverse ? (1.0f / N) : 1.0f;
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
float re = 0; float re = 0;
float im = 0; float im = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
int idx = (k * n * sin_cos_step) % n_sin_cos_vals; int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
float cos_val = cache.cos_vals[idx]; re += in[n] * g_cache.cos_vals[idx]; // cos(t)
float sin_val = cache.sin_vals[idx]; im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
if constexpr (RealInput) {
// Real input: in_im = 0, simplifies to:
// re += in_re * cos_val
// im += sign * in_re * sin_val
float in_re = in[n];
re += in_re * cos_val;
im += sign * in_re * sin_val;
} else {
float in_re = in[n * 2 + 0];
float in_im = in[n * 2 + 1];
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
re += in_re * cos_val - sign * in_im * sin_val;
im += sign * in_re * sin_val + in_im * cos_val;
}
} }
out[k * 2 + 0] = re * scale; out[k*2 + 0] = re;
out[k * 2 + 1] = im * scale; out[k*2 + 1] = im;
} }
} }
// Cooley-Tukey FFT/IFFT unified implementation // Cooley-Tukey FFT
// Template parameters: // poor man's implementation - use something better
// Inverse: false = FFT with exp(-2πi·k/N), no scaling // input is real-valued
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level // output is complex-valued
// RealInput: true = input is real-valued (stride 1) static void fft(float * in, int N, float * out) {
// false = input is complex-valued (interleaved real/imag, stride 2) const int n_sin_cos_vals = g_cache.sin_vals.size();
template <bool Inverse, bool RealInput>
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
if (N == 1) { if (N == 1) {
out[0] = in[0]; out[0] = in[0];
if constexpr (RealInput) { out[1] = 0;
out[1] = 0.0f;
} else {
out[1] = in[1];
}
return; return;
} }
const int half_N = N / 2; const int half_N = N / 2;
if (N - half_N * 2 == 1) { if (N - half_N*2 == 1) {
// Odd N: fall back to DFT dft(in, N, out);
dft_impl<Inverse, RealInput>(cache, in, N, out);
return; return;
} }
// Split into even and odd float* even = in + N;
if constexpr (RealInput) { for (int i = 0; i < half_N; ++i) {
// Real input: stride is 1, copy only real values even[i]= in[2*i];
float * even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i] = in[2 * i];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2 * i + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
} else {
// Complex input: stride is 2, copy complex pairs
float * even = in + N * 2;
for (int i = 0; i < half_N; ++i) {
even[i * 2 + 0] = in[2 * i * 2 + 0];
even[i * 2 + 1] = in[2 * i * 2 + 1];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
} }
float* even_fft = out + 2 * N;
fft(even, half_N, even_fft);
float * even_fft = out + 2 * N; float* odd = even;
float * odd_fft = even_fft + N; for (int i = 0; i < half_N; ++i) {
odd[i] = in[2*i + 1];
}
float* odd_fft = even_fft + N;
fft(odd, half_N, odd_fft);
const int sin_cos_step = n_sin_cos_vals / N; const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
constexpr float scale = Inverse ? 0.5f : 1.0f;
for (int k = 0; k < half_N; k++) { for (int k = 0; k < half_N; k++) {
int idx = k * sin_cos_step; // t = 2*M_PI*k/N int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = cache.cos_vals[idx]; float re = g_cache.cos_vals[idx]; // cos(t)
float im = sign * cache.sin_vals[idx]; float im = -g_cache.sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2 * k + 0]; float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2 * k + 1]; float im_odd = odd_fft[2*k + 1];
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd); out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd); out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd); out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd); out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
} }
} }
// Forward FFT for real input (used by mel spectrogram)
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<false, true>(cache, in, N, out);
}
// Inverse FFT for complex input
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<true, false>(cache, in, N, out);
}
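// Illustrative sketch (not part of the diff above): how these wrappers are
// driven elsewhere in this file, assuming the same scratch-buffer sizing as
// the worker thread below. The input buffer reserves 2*N floats because the
// recursive even/odd split copies samples past the first N entries, and the
// output buffer reserves extra room for the intermediate even/odd spectra.
//
//   const int N = 512;                              // FFT size (hypothetical)
//   mtmd_audio_cache cache;
//   cache.fill_sin_cos_table(N);                    // sin/cos tables shared by fft/ifft
//   std::vector<float> in (N * 2, 0.0f);            // N real samples + recursion scratch
//   std::vector<float> out(N * 2 * 2 * 2, 0.0f);    // interleaved re/im + recursion scratch
//   // ... fill in[0..N) with windowed samples, then:
//   fft(cache, in.data(), N, out.data());           // out[2k], out[2k+1] = re, im of bin k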
struct filter_params { struct filter_params {
int32_t n_mel; int32_t n_mel;
int32_t n_fft_bins; int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
bool norm_per_feature = false; bool norm_per_feature = false;
}; };
static void log_mel_spectrogram_worker_thread(int ith, static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
const float * hann, int n_samples, int frame_size, int frame_step, int n_threads,
const std::vector<float> & samples, const filter_params & params, mtmd_audio_mel & out) {
int n_samples,
int frame_size,
int frame_step,
int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
std::vector<float> fft_in(frame_size * 2, 0.0); std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2); std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft_bins = params.n_fft_bins; int n_fft_bins = params.n_fft_bins;
int i = ith; int i = ith;
const auto & filters = cache.filters; const auto & filters = g_cache.filters;
// make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
// calculate FFT only when fft_in are not all zero // calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
const int offset = i * frame_step; const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
} }
// FFT // FFT
fft(cache, fft_in.data(), frame_size, fft_out.data()); fft(fft_in.data(), frame_size, fft_out.data());
// Calculate modulus^2 of complex numbers // Calculate modulus^2 of complex numbers
// Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
const int n_samples_in, const int n_samples_in,
const int n_threads, const int n_threads,
const filter_params & params, const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) { mtmd_audio_mel & out) {
//const int64_t t_start_us = ggml_time_us(); //const int64_t t_start_us = ggml_time_us();
@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
int n_samples = n_samples_in; int n_samples = n_samples_in;
// Hann window // Hann window
const float * hann = cache.hann_window.data(); const float * hann = g_cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2; const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length; const int frame_step = params.hop_length;
// Padding // Padding
std::vector<float> samples_padded; std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(
// preemphasis // preemphasis
if (params.preemph) { if (params.preemph) {
const int pad_amount = frame_size / 2; const int pad_amount = frame_size / 2;
const float preemph = 0.97f; const float preemph = 0.97f;
float prev = samples_padded[pad_amount]; float prev = samples_padded[pad_amount];
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) { for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
float cur = samples_padded[i]; float cur = samples_padded[i];
samples_padded[i] = cur - preemph * prev; samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
{ {
std::vector<std::thread> workers(n_threads - 1); std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) { for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] = workers[iw] = std::thread(
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples, log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out)); n_samples, frame_size, frame_step, n_threads,
std::cref(params), std::ref(out));
} }
// main thread // main thread
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
cache, out);
for (int iw = 0; iw < n_threads - 1; ++iw) { for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join(); workers[iw].join();
} }
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(
for (int j = 0; j < effective_n_len; ++j) { for (int j = 0; j < effective_n_len; ++j) {
auto &value = out.data[i * out.n_len + j]; auto &value = out.data[i * out.n_len + j];
value = (value - mean) / mstd; value = (value - mean) / mstd;
} }
// pad the rest with zeros // pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
// //
void mtmd_audio_preprocessor_whisper::initialize() { void mtmd_audio_preprocessor_whisper::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft); g_cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true); g_cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); g_cache.fill_mel_filterbank_matrix(
hparams.n_mel_bins,
hparams.audio_n_fft,
hparams.audio_sample_rate);
} }
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, bool mtmd_audio_preprocessor_whisper::preprocess(
size_t n_samples, const float * samples,
std::vector<mtmd_audio_mel> & output) { size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
if (n_samples == 0) { if (n_samples == 0) {
// empty audio // empty audio
return false; return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
// if input is too short, pad with zeros // if input is too short, pad with zeros
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
// TODO: maybe handle this better // TODO: maybe handle this better
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
if (n_samples < min_samples) { if (n_samples < min_samples) {
smpl.resize(min_samples, 0.0f); smpl.resize(min_samples, 0.0f);
std::memcpy(smpl.data(), samples, n_samples * sizeof(float)); std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
params.hop_length = hparams.audio_hop_len; params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate; params.sample_rate = hparams.audio_sample_rate;
params.center_padding = false; params.center_padding = false;
params.preemph = 0.0f; // disabled params.preemph = 0.0f; // disabled
params.use_natural_log = false; params.use_natural_log = false;
params.norm_per_feature = false; params.norm_per_feature = false;
// make sure the cache is initialized // make sure the global cache is initialized
GGML_ASSERT(!cache.sin_vals.empty()); GGML_ASSERT(!g_cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty()); GGML_ASSERT(!g_cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty()); GGML_ASSERT(!g_cache.filters.data.empty());
mtmd_audio_mel out_full; mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples, bool ok = log_mel_spectrogram(
4, // n_threads samples,
params, cache, out_full); n_samples,
4, // n_threads
params,
out_full);
if (!ok) { if (!ok) {
return false; return false;
} }
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
} }
const size_t frames_per_chunk = 3000; const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
if ((size_t) n_len < frames_per_chunk) { if ((size_t)n_len < frames_per_chunk) {
break; // last incomplete chunk will always be a padded chunk, safe to ignore break; // last incomplete chunk will always be a padded chunk, safe to ignore
} }
mtmd_audio_mel out_chunk; mtmd_audio_mel out_chunk;
out_chunk.n_len = n_len; out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel; out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) { for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i * out_full.n_len + off; auto src = out_full.data.begin() + i*out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
} }
@@ -579,152 +535,3 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
return true; return true;
} }
//
// mtmd_audio_preprocessor_conformer
//
void mtmd_audio_preprocessor_conformer::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
// empty audio
if (n_samples == 0) {
return false;
}
filter_params params;
params.n_mel = hparams.n_mel_bins;
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
params.hann_window_size = hparams.audio_window_len;
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = true;
params.preemph = 0.97f;
params.use_natural_log = true;
params.norm_per_feature = true;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
if (!ok) {
return false;
}
output.push_back(std::move(out_full));
return true;
}
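// Illustrative sketch (not part of the diff above): the call pattern both
// preprocessors expect, assuming a hypothetical clip_ctx pointer and a buffer
// of mono float samples at the model's sample rate.
//
//   mtmd_audio_preprocessor_whisper pp(ctx_clip);   // or mtmd_audio_preprocessor_conformer
//   pp.initialize();                                // builds this instance's cache
//   std::vector<mtmd_audio_mel> mels;
//   if (pp.preprocess(samples.data(), samples.size(), mels)) {
//       // each entry holds an n_mel x n_len block of log-mel frames
//   }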
//
// mtmd_audio_streaming_istft implementation
//
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
n_fft(n_fft),
hop_length(hop_length),
n_fft_bins(n_fft / 2 + 1),
overlap_buffer(n_fft, 0.0f),
window_sum_buffer(n_fft, 0.0f),
padding_to_remove((n_fft - hop_length) / 2),
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
ifft_out(n_fft * 2 * 4, 0.0f) {
cache.fill_sin_cos_table(n_fft);
cache.fill_hann_window(n_fft, true);
}
void mtmd_audio_streaming_istft::reset() {
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
padding_to_remove = (n_fft - hop_length) / 2;
}
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
std::vector<float> output(hop_length);
// copy frequencies
for (int j = 0; j < n_fft_bins; j++) {
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
}
// mirror negative frequencies
for (int j = 1; j < n_fft_bins - 1; j++) {
int mirror_idx = n_fft - j;
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
}
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
// update window sum and overlap buffer
for (int j = 0; j < n_fft; j++) {
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
}
// extract hop_length samples with normalization
for (int i = 0; i < hop_length; i++) {
if (window_sum_buffer[i] > 1e-8f) {
output[i] = overlap_buffer[i] / window_sum_buffer[i];
} else {
output[i] = overlap_buffer[i];
}
}
// shift buffers left by hop_length
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
// Remove padding if needed
int to_remove = std::min(padding_to_remove, (int) output.size());
padding_to_remove -= to_remove;
output.erase(output.begin(), output.begin() + to_remove);
return output;
}
std::vector<float> mtmd_audio_streaming_istft::flush() {
std::vector<float> output;
// Extract remaining samples from overlap buffer
// Continue until we've extracted all meaningful samples
int remaining = n_fft - hop_length;
while (remaining > 0) {
int chunk_size = std::min(remaining, hop_length);
for (int i = 0; i < chunk_size; i++) {
float sample;
if (window_sum_buffer[i] > 1e-8f) {
sample = overlap_buffer[i] / window_sum_buffer[i];
} else {
sample = overlap_buffer[i];
}
output.push_back(sample);
}
// Shift buffers
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
remaining -= chunk_size;
}
return output;
}
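// Illustrative sketch (not part of the diff above): the streaming ISTFT call
// pattern, assuming spectra arrive one frame at a time as (n_fft/2 + 1)
// interleaved re/im pairs (the `frames` container here is hypothetical).
//
//   mtmd_audio_streaming_istft istft(/*n_fft=*/512, /*hop_length=*/128);
//   std::vector<float> audio;
//   for (const float * frame : frames) {
//       std::vector<float> chunk = istft.process_frame(frame);  // up to hop_length samples
//       audio.insert(audio.end(), chunk.begin(), chunk.end());
//   }
//   std::vector<float> tail = istft.flush();                     // drain the overlap buffer
//   audio.insert(audio.end(), tail.begin(), tail.end());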

View File

@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
std::vector<float> data; std::vector<float> data;
}; };
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// cache for audio processing, each processor instance owns its own cache
struct mtmd_audio_cache {
std::vector<float> sin_vals;
std::vector<float> cos_vals;
std::vector<float> hann_window;
mtmd_audio_mel_filters filters;
void fill_sin_cos_table(int n);
void fill_hann_window(int length, bool periodic);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling
);
};
struct mtmd_audio_preprocessor { struct mtmd_audio_preprocessor {
const clip_hparams & hparams; const clip_hparams & hparams;
@@ -63,51 +31,4 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override; void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
//
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
//
struct mtmd_audio_streaming_istft {
mtmd_audio_streaming_istft(int n_fft, int hop_length);
// reset streaming state
void reset();
// process a single STFT frame (streaming)
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
// returns: up to hop_length samples
std::vector<float> process_frame(const float * frame_spectrum);
// flush remaining samples at end of stream
std::vector<float> flush();
private:
int n_fft;
int hop_length;
int n_fft_bins;
// Own cache for output processing
mtmd_audio_cache cache;
// Streaming state
std::vector<float> overlap_buffer;
std::vector<float> window_sum_buffer;
int padding_to_remove;
// Working buffers for IFFT
std::vector<float> ifft_in;
std::vector<float> ifft_out;
}; };

View File

@@ -121,8 +121,6 @@ mtmd_context_params mtmd_context_params_default() {
/* warmup */ true, /* warmup */ true,
/* image_min_tokens */ -1, /* image_min_tokens */ -1,
/* image_max_tokens */ -1, /* image_max_tokens */ -1,
/* cb_eval */ nullptr,
/* cb_eval_user_data */ nullptr,
}; };
return params; return params;
} }
@@ -158,6 +156,8 @@ struct mtmd_context {
bool tok_row_end_trail = false; bool tok_row_end_trail = false;
bool ov_img_first = false; bool ov_img_first = false;
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// string template for slice image delimiters with row/col (idefics3) // string template for slice image delimiters with row/col (idefics3)
std::string sli_img_start_tmpl; std::string sli_img_start_tmpl;
@@ -188,8 +188,6 @@ struct mtmd_context {
/* image_min_tokens */ ctx_params.image_min_tokens, /* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens, /* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup, /* warmup */ ctx_params.warmup,
/* cb_eval */ ctx_params.cb_eval,
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
}; };
auto res = clip_init(mmproj_fname, ctx_clip_params); auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -229,6 +227,7 @@ struct mtmd_context {
void init_vision() { void init_vision() {
GGML_ASSERT(ctx_v != nullptr); GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v); projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v);
@@ -277,7 +276,7 @@ struct mtmd_context {
} }
// set boi/eoi // set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) { if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image> // <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>"; img_beg = "<start_of_image>";
img_end = "<end_of_image>"; img_end = "<end_of_image>";
@@ -294,7 +293,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]"; img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) { } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|> // <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>"; img_beg = "<|vision_start|>";
img_end = "<|vision_end|>"; img_end = "<|vision_end|>";
@@ -340,13 +339,8 @@ struct mtmd_context {
case PROJECTOR_TYPE_QWEN25O: case PROJECTOR_TYPE_QWEN25O:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a); audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break; break;
case PROJECTOR_TYPE_LFM2A:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
break;
default: default:
GGML_ABORT("unsupported audio projector type"); GGML_ABORT("unsupported audio projector type");
} }
@@ -364,9 +358,6 @@ struct mtmd_context {
// [BEGIN_AUDIO] ... (embeddings) ... // [BEGIN_AUDIO] ... (embeddings) ...
aud_beg = "[BEGIN_AUDIO]"; aud_beg = "[BEGIN_AUDIO]";
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// <sound> ... (embeddings) ...
aud_beg = "<sound>";
} }
} }
@@ -638,7 +629,7 @@ struct mtmd_tokenizer {
} }
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (mtmd_decode_use_mrope(ctx)) { if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions // for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
@@ -873,24 +864,14 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
} }
bool mtmd_decode_use_non_causal(mtmd_context * ctx) { bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) { if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
case PROJECTOR_TYPE_GEMMA3: return true;
return true;
default:
return false;
} }
return false;
} }
bool mtmd_decode_use_mrope(mtmd_context * ctx) { bool mtmd_decode_use_mrope(mtmd_context * ctx) {
switch (ctx->proj_type_v()) { return ctx->use_mrope;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return true;
default:
return false;
}
} }
bool mtmd_support_vision(mtmd_context * ctx) { bool mtmd_support_vision(mtmd_context * ctx) {

View File

@@ -27,9 +27,6 @@
* - Make sure the C API is aligned with the libllama C API (as in llama.h) * - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary * - Keep the API minimal, do not expose internal details unless necessary
*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/ */
#ifdef LLAMA_SHARED #ifdef LLAMA_SHARED
@@ -98,10 +95,6 @@ struct mtmd_context_params {
// limit number of image tokens, only for vision models with dynamic resolution // limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
// callback function passed over to mtmd proper
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
}; };
MTMD_API const char * mtmd_default_marker(void); MTMD_API const char * mtmd_default_marker(void);
@@ -227,7 +220,7 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
// get output embeddings from the last encode pass // get output embeddings from the last encode pass
// the reading size (in bytes) is equal to: // the reading size (in bytes) is equal to:
// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// Set callback for all future logging events. // Set callback for all future logging events.
@@ -280,12 +273,12 @@ struct bitmap {
ptr.reset(mtmd_bitmap_init(nx, ny, data)); ptr.reset(mtmd_bitmap_init(nx, ny, data));
} }
~bitmap() = default; ~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
}; };
struct bitmaps { struct bitmaps {
@@ -309,8 +302,8 @@ struct input_chunks {
input_chunks() = default; input_chunks() = default;
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
~input_chunks() = default; ~input_chunks() = default;
size_t size() const { return mtmd_input_chunks_size(ptr.get()); } size_t size() { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) const { const mtmd_input_chunk * operator[](size_t idx) {
return mtmd_input_chunks_get(ptr.get(), idx); return mtmd_input_chunks_get(ptr.get(), idx);
} }
}; };

View File

@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-) 8 files changed, 21 insertions(+), 2 deletions(-)
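Across all of these backends the change is the same one-liner: once the backend-specific context has been released, the free_buffer callback now also deletes the buffer object itself. A sketch of the pattern with a hypothetical backend name (not one of the literal hunks below):

    static void ggml_backend_example_buffer_free_buffer(ggml_backend_buffer_t buffer) {
        example_buffer_context * ctx = (example_buffer_context *) buffer->context;
        delete ctx;      // backend-specific cleanup, unchanged
        delete buffer;   // added: the buffer object is freed here as well
    }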
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 354876574..9e67c769a 100644 index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -42,7 +42,7 @@ index 354876574..9e67c769a 100644
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2126,6 +2126,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -2125,6 +2125,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer); GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
@@ -54,7 +54,7 @@ index 354876574..9e67c769a 100644
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2178,7 +2183,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { @@ -2177,7 +2182,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index 354876574..9e67c769a 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 42c6c67a4..db33e0bc0 100644 index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) { @@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx; delete ctx;
@@ -75,7 +75,7 @@ index 42c6c67a4..db33e0bc0 100644
} }
/** /**
@@ -1559,6 +1560,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf @@ -1570,6 +1571,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/ */
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context)); ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,7 +84,7 @@ index 42c6c67a4..db33e0bc0 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e9df0ea4a..290d762ad 100644 index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context { @@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index e9df0ea4a..290d762ad 100644
static void * ggml_cuda_host_malloc(size_t size) { static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 56b59f0af..790cabca0 100644 index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 56b59f0af..790cabca0 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 678e40965..0b3914ce6 100644 index 0d37587f6..ff373d413 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3675,6 +3675,7 @@ struct ggml_backend_opencl_buffer_context { @@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx; delete ctx;
@@ -144,10 +144,10 @@ index 678e40965..0b3914ce6 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d7c8ad8c1..281fa1bdb 100644 index 18a45d2d9..89041805e 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -557,6 +557,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status); RPC_STATUS_ASSERT(status);
delete ctx; delete ctx;
@@ -156,7 +156,7 @@ index d7c8ad8c1..281fa1bdb 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ce2f0d41c..3d5924105 100644 index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -175,19 +175,19 @@ index ce2f0d41c..3d5924105 100644
} }
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1175,6 +1177,7 @@ inline void free_aligned_mem_host(void * memblock) { @@ -1159,6 +1161,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free_aligned_mem_host((void *)buffer->context); ggml_sycl_host_free(buffer->context);
+ delete buffer; + delete buffer;
} }
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b5e5dba95..cc9b38b54 100644 index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12859,6 +12859,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
@@ -195,7 +195,7 @@ index b5e5dba95..cc9b38b54 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -13002,6 +13003,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);

View File

@@ -6,14 +6,14 @@ Subject: [PATCH] pretokenizer
allow for an unset pretokenizer with a warning in the allow for an unset pretokenizer with a warning in the
logs instead of throwing an error logs instead of throwing an error
--- ---
src/llama-vocab.cpp | 17 +++++------------ src/llama-vocab.cpp | 14 +++-----------
1 file changed, 5 insertions(+), 12 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
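In other words, the hard error on an unknown or missing pre-tokenizer value is downgraded to a warning plus the default pre-tokenizer. A sketch of the fallback pattern (not the literal replacement hunk):

    } else {
        LLAMA_LOG_WARN("%s: unknown pre-tokenizer type '%s', falling back to 'default'\n",
                __func__, tokenizer_pre.c_str());
        pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }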
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d00..886ed637d 100644 index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1839,16 +1839,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false; add_space_prefix = false;
clean_spaces = true; clean_spaces = true;
@@ -31,8 +31,8 @@ index a23950d00..886ed637d 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -2042,7 +2033,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -41,20 +41,3 @@ index a23950d00..886ed637d 100644
} }
} else if (type == LLAMA_VOCAB_TYPE_SPM) { } else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2086,6 +2078,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}
+ const uint32_t n_scores = score_idx != -1 ? gguf_get_arr_n(ctx, score_idx) : 0;
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
@@ -2107,7 +2100,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
auto & token_data = id_to_token[i];
token_data.text = std::move(word);
- token_data.score = scores ? scores[i] : 0.0f;
+ token_data.score = (scores && i < n_scores) ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file

View File

@@ -6,11 +6,11 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters filesystems for paths that include wide characters
--- ---
tools/mtmd/clip.cpp | 47 +++++++++++++++++++++++++++++++++++++++++---- tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+), 4 deletions(-) 1 file changed, 39 insertions(+)
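The fix follows the usual Win32 pattern: convert the UTF-8 model path to UTF-16 and open the stream with the wide path so non-ASCII characters resolve correctly. A sketch of that conversion (hypothetical helper name, requires <windows.h>; not the literal hunk):

    #if defined(_WIN32)
    static std::wstring utf8_to_wstring(const std::string & s) {
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
        std::wstring w(n, 0);
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
        return w;
    }
    // later, when loading tensor data:
    // std::ifstream fin(utf8_to_wstring(fname), std::ios::binary);
    #endif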
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9b076e0c5..18dab19df 100644 index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@ @@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 9b076e0c5..18dab19df 100644
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1837,7 +1850,29 @@ struct clip_model_loader { @@ -1619,7 +1632,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 9b076e0c5..18dab19df 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1864,7 +1899,11 @@ struct clip_model_loader { @@ -1646,7 +1681,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
@@ -75,39 +75,3 @@ index 9b076e0c5..18dab19df 100644
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
} }
@@ -2247,7 +2286,7 @@ struct img_tool {
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width;
dst.ny = target_resolution.height;
- dst.buf.resize(3 * dst.nx * dst.ny);
+ dst.buf.resize(3 * static_cast<size_t>(dst.nx) * static_cast<size_t>(dst.ny));
if (dst.nx == src.nx && dst.ny == src.ny) {
// no resize needed, simple copy
@@ -2300,7 +2339,7 @@ struct img_tool {
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w;
dst.ny = h;
- dst.buf.resize(3 * w * h);
+ dst.buf.resize(3 * static_cast<size_t>(w) * static_cast<size_t>(h));
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
@@ -2397,7 +2436,7 @@ private:
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
dst.nx = target_width;
dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
+ dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height));
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
@@ -2436,7 +2475,7 @@ private:
dst.nx = target_width;
dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
+ dst.buf.resize(3 * static_cast<size_t>(target_width) * static_cast<size_t>(target_height));
float Cc;
float C[5] = {};

View File

@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
create mode 100644 src/models/solar.cpp create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f337afd6b..b08cd324d 100644 index 4192af7c0..bd44d73e7 100644
--- a/src/CMakeLists.txt --- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt +++ b/src/CMakeLists.txt
@@ -131,6 +131,7 @@ add_library(llama @@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp models/seed-oss.cpp
models/smallthinker.cpp models/smallthinker.cpp
models/smollm3.cpp models/smollm3.cpp
@@ -31,10 +31,10 @@ index f337afd6b..b08cd324d 100644
models/starcoder.cpp models/starcoder.cpp
models/starcoder2.cpp models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956..a62a03e14 100644 index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -90,6 +90,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -42,7 +42,7 @@ index a54bc1956..a62a03e14 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -216,6 +217,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
@@ -50,7 +50,7 @@ index a54bc1956..a62a03e14 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -348,6 +350,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = { @@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
@@ -58,9 +58,9 @@ index a54bc1956..a62a03e14 100644
{ LLM_TENSOR_POS_EMBD, "position_embd" }, { LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -2289,6 +2292,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) { @@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN, return {
LLM_TENSOR_FFN_UP, LLM_TENSOR_TOKEN_EMBD,
}; };
+ case LLM_ARCH_SOLAR: + case LLM_ARCH_SOLAR:
+ return { + return {
@@ -81,7 +81,7 @@ index a54bc1956..a62a03e14 100644
default: default:
GGML_ABORT("unknown architecture for tensor mapping"); GGML_ABORT("unknown architecture for tensor mapping");
} }
@@ -2457,6 +2476,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index a54bc1956..a62a03e14 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16..d96470a0d 100644 index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -94,6 +94,7 @@ enum llm_arch { @@ -91,6 +91,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 270d28b16..d96470a0d 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
@@ -220,6 +221,7 @@ enum llm_kv { @@ -212,6 +213,7 @@ enum llm_kv {
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 270d28b16..d96470a0d 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -474,6 +476,7 @@ enum llm_tensor { @@ -465,6 +467,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
@@ -118,10 +118,10 @@ index 270d28b16..d96470a0d 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 392f9160c..14e089efb 100644 index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -167,6 +167,14 @@ uint32_t llama_hparams::n_pos_per_embd() const { @@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1; return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
} }
@@ -137,10 +137,10 @@ index 392f9160c..14e089efb 100644
if (il < n_layer) { if (il < n_layer) {
return swa_layers[il]; return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index caed0ec1b..61a1fbef6 100644 index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -66,6 +66,8 @@ struct llama_hparams { @@ -65,6 +65,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -149,7 +149,7 @@ index caed0ec1b..61a1fbef6 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -267,6 +269,9 @@ struct llama_hparams { @@ -259,6 +261,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const; uint32_t n_pos_per_embd() const;
@@ -158,12 +158,12 @@ index caed0ec1b..61a1fbef6 100644
+ +
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
// note: currently only support if either all or none of the layers are MLA bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 383b8dc76..c2e758737 100644 index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -497,7 +497,7 @@ namespace GGUFMeta { @@ -466,7 +466,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index 383b8dc76..c2e758737 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index cc784e1cb..c093207e0 100644 index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -2114,6 +2114,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
@@ -198,7 +198,7 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5741,6 +5756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -233,7 +233,7 @@ index cc784e1cb..c093207e0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7981,6 +8024,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { @@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params); llm = std::make_unique<llm_build_chameleon>(*this, params);
} break; } break;
@@ -244,7 +244,7 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -8259,6 +8306,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index cc784e1cb..c093207e0 100644
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index d1de16e3f..e8452eda5 100644 index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -80,6 +80,7 @@ enum llm_type { @@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index d1de16e3f..e8452eda5 100644
LLM_TYPE_26B, LLM_TYPE_26B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
@@ -411,6 +412,8 @@ struct llama_layer { @@ -405,6 +406,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr; struct ggml_tensor * ffn_act_eps = nullptr;
@@ -274,10 +274,10 @@ index d1de16e3f..e8452eda5 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h diff --git a/src/models/models.h b/src/models/models.h
index 3a44f7f14..eabe9c81c 100644 index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h --- a/src/models/models.h
+++ b/src/models/models.h +++ b/src/models/models.h
@@ -544,6 +544,11 @@ struct llm_build_smollm3 : public llm_graph_context { @@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params); llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
}; };


@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 886ed637d..923e850cb 100644 index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 886ed637d..923e850cb 100644
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe61..6d1084f26 100644 index bb44edfad..13ced055f 100644
--- a/src/unicode.cpp --- a/src/unicode.cpp
+++ b/src/unicode.cpp +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@ @@ -2,6 +2,11 @@


@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-) 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 6bee1bc4b..f3d371dcc 100644 index 4181a714a..079dba211 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -167,7 +167,7 @@ struct ggml_backend_reg_entry { @@ -183,7 +183,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends; std::vector<ggml_backend_reg_entry> backends;
@@ -23,7 +23,7 @@ index 6bee1bc4b..f3d371dcc 100644
ggml_backend_registry() { ggml_backend_registry() {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
@@ -221,7 +221,7 @@ struct ggml_backend_registry { @@ -237,7 +237,7 @@ struct ggml_backend_registry {
} }
} }
@@ -32,7 +32,7 @@ index 6bee1bc4b..f3d371dcc 100644
if (!reg) { if (!reg) {
return; return;
} }
@@ -232,15 +232,20 @@ struct ggml_backend_registry { @@ -248,15 +248,20 @@ struct ggml_backend_registry {
#endif #endif
backends.push_back({ reg, std::move(handle) }); backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@@ -56,7 +56,7 @@ index 6bee1bc4b..f3d371dcc 100644
} }
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) { ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -284,7 +289,7 @@ struct ggml_backend_registry { @@ -300,7 +305,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str()); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
@@ -65,7 +65,7 @@ index 6bee1bc4b..f3d371dcc 100644
return reg; return reg;
} }
@@ -307,7 +312,7 @@ struct ggml_backend_registry { @@ -323,7 +328,7 @@ struct ggml_backend_registry {
// remove devices // remove devices
devices.erase( devices.erase(
std::remove_if(devices.begin(), devices.end(), std::remove_if(devices.begin(), devices.end(),
@@ -74,7 +74,7 @@ index 6bee1bc4b..f3d371dcc 100644
devices.end()); devices.end());
// remove backend // remove backend
@@ -365,7 +370,7 @@ size_t ggml_backend_dev_count() { @@ -381,7 +386,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) { ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count()); GGML_ASSERT(index < ggml_backend_dev_count());


@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6192a8704..993ec027f 100644 index 4c04c3300..f4747f262 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name) @@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -26,4 +26,4 @@ index 6192a8704..993ec027f 100644
+ add_custom_target(ggml-cpu) + add_custom_target(ggml-cpu)
if (GGML_SYSTEM_ARCH STREQUAL "x86") if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64) ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42) ggml_add_cpu_backend_variant(sse42 SSE42)


@@ -5,22 +5,21 @@ Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems disable amx as it reduces performance on some systems
--- ---
ggml/src/CMakeLists.txt | 5 +---- ggml/src/CMakeLists.txt | 4 ----
1 file changed, 1 insertion(+), 4 deletions(-) 1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 993ec027f..cbda1380c 100644 index f4747f262..d55aed348 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -379,10 +379,7 @@ if (GGML_CPU_ALL_VARIANTS) @@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16) ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
endif() ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI) ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC) - if (NOT MSVC)
- # MSVC doesn't support AMX - # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) - ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif() - endif()
+ # AMX variants removed by ollama - sapphirerapids with AMX_TILE AMX_INT8 not included
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM") elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular # Many of these features are optional so we build versions with popular


@@ -25,10 +25,10 @@ index 79ee20206..3efb22f01 100644
// get ith C string from array with given key_id // get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ed0d7f2ca..db55f6ed1 100644 index b165d8bdc..f91d4faba 100644
--- a/ggml/src/gguf.cpp --- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp
@@ -813,10 +813,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id @@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) { const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
@@ -44,7 +44,7 @@ index ed0d7f2ca..db55f6ed1 100644
const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) { const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING); GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
@@ -910,7 +914,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) { @@ -902,7 +906,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) { const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1); GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
@@ -53,10 +53,10 @@ index ed0d7f2ca..db55f6ed1 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 923e850cb..0917191b5 100644 index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1795,9 +1795,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) { if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);


@@ -8,19 +8,19 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b1de2ae87..42e892527 100644 index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -16,6 +16,8 @@ @@ -15,6 +15,8 @@
#include "ops.h"
#include "ggml.h" #include "ggml.h"
#include "common.h"
+#include "ollama-debug.h" +#include "ollama-debug.h"
+ +
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2952,6 +2954,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2945,6 +2947,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);


@@ -10,10 +10,10 @@ Subject: [PATCH] add ollama vocab for grammar support
3 files changed, 58 insertions(+), 10 deletions(-) 3 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 64ea2fd00..d87e52ded 100644 index 75d5d750c..a0299d181 100644
--- a/src/llama-grammar.cpp --- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp
@@ -1079,6 +1079,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack( @@ -1041,6 +1041,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -21,7 +21,7 @@ index 64ea2fd00..d87e52ded 100644
const llama_grammar_element ** rules, const llama_grammar_element ** rules,
size_t n_rules, size_t n_rules,
size_t start_rule_index) { size_t start_rule_index) {
@@ -1134,6 +1135,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1096,6 +1097,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope. // then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { return new llama_grammar {
vocab, vocab,
@@ -29,7 +29,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules), std::move(vec_rules),
std::move(stacks), std::move(stacks),
/* .partial_utf8 = */ {}, /* .partial_utf8 = */ {},
@@ -1148,6 +1150,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1110,6 +1112,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -37,7 +37,7 @@ index 64ea2fd00..d87e52ded 100644
const char * grammar_str, const char * grammar_str,
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
@@ -1240,6 +1243,7 @@ struct llama_grammar * llama_grammar_init_impl( @@ -1202,6 +1205,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope. // then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { return new llama_grammar {
vocab, vocab,
@@ -45,7 +45,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules), std::move(vec_rules),
std::move(stacks), std::move(stacks),
/* .partial_utf8 = */ {}, /* .partial_utf8 = */ {},
@@ -1263,6 +1267,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) { @@ -1225,6 +1229,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar { auto * result = new llama_grammar {
grammar.vocab, grammar.vocab,
@@ -53,7 +53,7 @@ index 64ea2fd00..d87e52ded 100644
grammar.rules, grammar.rules,
grammar.stacks, grammar.stacks,
grammar.partial_utf8, grammar.partial_utf8,
@@ -1291,7 +1296,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra @@ -1253,7 +1258,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
} }
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) { void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
@@ -61,7 +61,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) { if (grammar.awaiting_trigger) {
return; return;
@@ -1313,9 +1317,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_ @@ -1275,9 +1279,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id; const llama_token id = cur_p->data[i].id;
@@ -77,7 +77,7 @@ index 64ea2fd00..d87e52ded 100644
if (!allow_eog) { if (!allow_eog) {
cur_p->data[i].logit = -INFINITY; cur_p->data[i].logit = -INFINITY;
} }
@@ -1334,9 +1342,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_ @@ -1296,9 +1304,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
} }
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) { void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
@@ -90,7 +90,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) { if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) { if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1380,13 +1389,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token @@ -1353,13 +1362,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
} }
} }
@@ -107,7 +107,7 @@ index 64ea2fd00..d87e52ded 100644
} }
llama_grammar_accept_token(grammar, token, piece); llama_grammar_accept_token(grammar, token, piece);
@@ -1462,3 +1472,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke @@ -1435,3 +1445,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
} }
} }
@@ -136,7 +136,7 @@ index 64ea2fd00..d87e52ded 100644
+ } + }
+} +}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index b5a0e588e..57847583a 100644 index a4c978ac1..5c0da4049 100644
--- a/src/llama-grammar.h --- a/src/llama-grammar.h
+++ b/src/llama-grammar.h +++ b/src/llama-grammar.h
@@ -6,8 +6,19 @@ @@ -6,8 +6,19 @@
@@ -159,7 +159,7 @@ index b5a0e588e..57847583a 100644
// grammar element type // grammar element type
enum llama_gretype { enum llama_gretype {
@@ -129,6 +140,7 @@ struct llama_grammar { @@ -127,6 +138,7 @@ struct llama_grammar {
// note: allow null vocab for testing (not great) // note: allow null vocab for testing (not great)
const llama_vocab * vocab; const llama_vocab * vocab;
@@ -167,7 +167,7 @@ index b5a0e588e..57847583a 100644
const llama_grammar_rules rules; // TODO: shared ptr const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks; llama_grammar_stacks stacks;
@@ -157,12 +169,14 @@ struct llama_grammar { @@ -155,12 +167,14 @@ struct llama_grammar {
// note: needed for tests (not great) // note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl( struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
@@ -183,10 +183,10 @@ index b5a0e588e..57847583a 100644
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 5dde51306..90f6f1b3d 100644 index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp --- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp
@@ -2504,7 +2504,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { @@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
} }
@@ -195,7 +195,7 @@ index 5dde51306..90f6f1b3d 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -2586,9 +2586,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( @@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*"; trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() }; std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };


@@ -5,17 +5,17 @@ Subject: [PATCH] add argsort and cuda copy for i32
--- ---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++ ggml/src/ggml-cpu/ops.cpp | 43 ++++++
ggml/src/ggml-cuda/argsort.cu | 120 +++++++++++++-- ggml/src/ggml-cuda/argsort.cu | 122 +++++++++++++--
ggml/src/ggml-cuda/cpy-utils.cuh | 6 + ggml/src/ggml-cuda/cpy-utils.cuh | 6 +
ggml/src/ggml-cuda/cpy.cu | 40 +++++ ggml/src/ggml-cuda/cpy.cu | 40 +++++
ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++
5 files changed, 413 insertions(+), 11 deletions(-) 5 files changed, 414 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 48c896436..c08e73f3c 100644 index 303278397..7d1733adb 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7958,6 +7958,45 @@ static void ggml_compute_forward_argsort_f32( @@ -7932,6 +7932,45 @@ static void ggml_compute_forward_argsort_f32(
} }
} }
@@ -61,7 +61,7 @@ index 48c896436..c08e73f3c 100644
void ggml_compute_forward_argsort( void ggml_compute_forward_argsort(
const ggml_compute_params * params, const ggml_compute_params * params,
ggml_tensor * dst) { ggml_tensor * dst) {
@@ -7969,6 +8008,10 @@ void ggml_compute_forward_argsort( @@ -7943,6 +7982,10 @@ void ggml_compute_forward_argsort(
{ {
ggml_compute_forward_argsort_f32(params, dst); ggml_compute_forward_argsort_f32(params, dst);
} break; } break;
@@ -73,10 +73,10 @@ index 48c896436..c08e73f3c 100644
{ {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 4896669c3..6fae8b808 100644 index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu --- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu
@@ -198,13 +198,107 @@ void argsort_f32_i32_cuda_bitonic(const float * x, @@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
} }
} }
@@ -185,27 +185,28 @@ index 4896669c3..6fae8b808 100644
GGML_ASSERT( dst->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
@@ -213,18 +307,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ if (src0->type == GGML_TYPE_I32) { -#ifdef GGML_CUDA_USE_CUB
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else {
#ifdef GGML_CUDA_USE_CUB
- const int ncols_pad = next_power_of_2(ncols); - const int ncols_pad = next_power_of_2(ncols);
- const size_t shared_mem = ncols_pad * sizeof(int); - const size_t shared_mem = ncols_pad * sizeof(int);
- const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; - const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+ const int ncols_pad = next_power_of_2(ncols); -
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
- if (shared_mem > max_shared_mem || ncols > 1024) { - if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool(); - ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); - argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } else { + if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
} else {
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); - argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } - }
+#ifdef GGML_CUDA_USE_CUB
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ if (shared_mem > max_shared_mem || ncols > 1024) { + if (shared_mem > max_shared_mem || ncols > 1024) {
+ ggml_cuda_pool & pool = ctx.pool(); + ggml_cuda_pool & pool = ctx.pool();
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); + argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
@@ -233,10 +234,10 @@ index 7697c292d..00d773dd3 100644
+ *dst = *src; + *dst = *src;
+} +}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index ee84303ef..178e82d76 100644 index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu --- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -369,6 +369,43 @@ static void ggml_cpy_f32_iq4_nl_cuda( @@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
@@ -280,7 +281,7 @@ index ee84303ef..178e82d76 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
@@ -495,6 +532,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg @@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float> ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
@@ -291,10 +292,10 @@ index ee84303ef..178e82d76 100644
if (can_be_transposed) { if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true> ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 17e358d1a..2e463bd99 100644 index 51bcbae30..236838e9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4955,8 +4955,77 @@ kernel void kernel_argsort_f32_i32( @@ -4954,8 +4954,77 @@ kernel void kernel_argsort_f32_i32(
} }
} }
@@ -372,7 +373,7 @@ index 17e358d1a..2e463bd99 100644
typedef void (argsort_merge_t)( typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args, constant ggml_metal_kargs_argsort_merge & args,
@@ -5111,8 +5180,154 @@ kernel void kernel_argsort_merge_f32_i32( @@ -5110,8 +5179,154 @@ kernel void kernel_argsort_merge_f32_i32(
} }
} }


@@ -23,7 +23,7 @@ index 78aa059dd..7fa8403b3 100644
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a9d177864..393c329be 100644 index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -319,6 +319,7 @@ extern "C" { @@ -319,6 +319,7 @@ extern "C" {
@@ -121,7 +121,7 @@ index 41419b617..73b39bfea 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 9e67c769a..20b37a0b3 100644 index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe @@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe


@@ -1,32 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: nobody <> From: Daniel Hiltgen <daniel@ollama.com>
Date: Sat, 10 Jan 2026 15:27:57 -0800 Date: Sun, 30 Nov 2025 11:05:56 -0800
Subject: [PATCH] ggml: Export GPU UUIDs Subject: [PATCH] ggml: Export GPU UUIDs
--- ---
ggml/include/ggml-backend.h | 2 + ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++++++++++++++++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 + ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 69 insertions(+), 6 deletions(-) 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 393c329be..99412fe56 100644 index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,8 @@ extern "C" { @@ -158,6 +158,7 @@ extern "C" {
const char * description; const char * description;
// device free memory in bytes // device free memory in bytes
size_t memory_free; size_t memory_free;
+ // device UUID
+ const char * id; + const char * id;
// device total memory in bytes // device total memory in bytes
size_t memory_total; size_t memory_total;
// device type // device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 290d762ad..9b9e053f0 100644 index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -191,6 +191,51 @@ static int ggml_cuda_parse_id(char devName[]) { @@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
} }
#endif // defined(GGML_USE_HIP) #endif // defined(GGML_USE_HIP)
@@ -78,7 +77,7 @@ index 290d762ad..9b9e053f0 100644
static ggml_cuda_device_info ggml_cuda_init() { static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {}; ggml_cuda_device_info info = {};
@@ -255,22 +300,29 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10; info.devices[id].cc += prop.minor * 0x10;
} }
} }
@@ -103,26 +102,21 @@ index 290d762ad..9b9e053f0 100644
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+#ifdef __CUDA_ARCH_LIST__
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+ }
+#endif // defined(__CUDA_ARCH_LIST__)
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str()); + ggml_cuda_parse_uuid(prop, id).c_str());
std::string device_name(prop.name); std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") { if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name }); turing_devices_without_mma.push_back({ id, device_name });
@@ -4155,6 +4207,7 @@ struct ggml_backend_cuda_device_context { @@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name; std::string name;
std::string description; std::string description;
std::string pci_bus_id; std::string pci_bus_id;
+ std::string id; + std::string id;
int op_offload_min_batch_size;
}; };
@@ -4244,6 +4297,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
} }
#endif // defined(__linux__) #endif // defined(__linux__)
@@ -134,7 +128,7 @@ index 290d762ad..9b9e053f0 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
@@ -4284,6 +4342,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev); props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev); props->description = ggml_backend_cuda_device_get_description(dev);
@@ -142,7 +136,7 @@ index 290d762ad..9b9e053f0 100644
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4900,6 +4959,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
@@ -151,7 +145,7 @@ index 290d762ad..9b9e053f0 100644
char pci_bus_id[16] = {}; char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 790cabca0..516d74064 100644 index f2b7fe692..8fc1c2fb5 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen @@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen

Some files were not shown because too many files have changed in this diff.