withcatai · giladgd · Apr 6, 2026 · Apr 12, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts
@@ -53,6 +53,7 @@ const chatWrappersOrder = [
     "Llama3ChatWrapper",
     "Llama2ChatWrapper",
     "MistralChatWrapper",
+    "Gemma4ChatWrapper",
     "GemmaChatWrapper",
     "ChatMLChatWrapper",
     "FalconChatWrapper",

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
@@ -120,8 +120,8 @@ list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
 list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
 list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)
 
-addVariantSuffix(llama ${NLC_VARIANT})
-addVariantSuffix(ggml ${NLC_VARIANT})
+addVariantSuffix(llama "${NLC_VARIANT}")
+addVariantSuffix(ggml "${NLC_VARIANT}")
 
 file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})
 

diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
@@ -2,6 +2,7 @@
 #include <algorithm>
 #include <cmath>
 #include "common/common.h"
+#include "llama-context.h"
 #include "llama-vocab.h"
 #include "llama.h"
 
@@ -107,15 +108,15 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker {
             try {
                 context->ctx = llama_init_from_model(context->model->model, context->context_params);
 
-                context->contextLoaded = context->ctx != nullptr && context->ctx != NULL;
+                context->contextLoaded = context->ctx != nullptr;
             } catch (const std::exception& e) {
                 SetError(e.what());
             } catch(...) {
                 SetError("Unknown error when calling \"llama_init_from_model\"");
             }
         }
         void OnOK() {
-            if (context->contextLoaded) {
+            if (context->contextLoaded && !context->model->model_params.no_alloc) {
                 uint64_t contextMemorySize = llama_state_get_size(context->ctx);
                 adjustNapiExternalMemoryAdd(Env(), contextMemorySize);
                 context->loadedContextMemorySize = contextMemorySize;
@@ -173,8 +174,10 @@ class AddonContextUnloadContextWorker : public Napi::AsyncWorker {
             }
         }
         void OnOK() {
-            adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize);
-            context->loadedContextMemorySize = 0;
+            if (!context->model->model_params.no_alloc) {
+                adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize);
+                context->loadedContextMemorySize = 0;
+            }
 
             adjustNapiExternalMemorySubtract(Env(), context->batchMemorySize);
             context->batchMemorySize = 0;
@@ -251,22 +254,8 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             sampler->rebuildChainIfNeeded();
 
-            const auto * logits = llama_get_logits_ith(ctx->ctx, batchLogitIndex);
-            const int n_vocab = llama_vocab_n_tokens(ctx->model->vocab);
-
-            auto & candidates = sampler->tokenCandidates;
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-            }
-
-            llama_token_data_array cur_p = {
-                /* .data       = */ candidates.data(),
-                /* .size       = */ candidates.size(),
-                /* .selected   = */ -1,
-                /* .sorted     = */ false,
-            };
-
-            llama_sampler_apply(sampler->chain, &cur_p);
+            llama_token_data_array cur_p;
+            sampler->sample(ctx->ctx, batchLogitIndex, cur_p, returnProbabilities || returnConfidence);
 
             if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
                 no_output = true;
@@ -403,7 +392,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
     context_params.swa_full = false;
 
     if (info.Length() > 1 && info[1].IsObject()) {
-        Napi::Object options = info[1].As<Napi::Object>();
+        const auto options = info[1].As<Napi::Object>();
 
         if (options.Has("contextSize")) {
             context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Uint32Value();
@@ -427,31 +416,41 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
         }
 
         if (options.Has("flashAttention")) {
-            bool flashAttention = options.Get("flashAttention").As<Napi::Boolean>().Value();
-            context_params.flash_attn_type = flashAttention ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            const auto flashAttention = options.Get("flashAttention");
+
+            if (flashAttention.IsString() && flashAttention.As<Napi::String>().Utf8Value() == "auto") {
+                context_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                const bool flashAttentionEnabled = flashAttention.As<Napi::Boolean>().Value();
+                context_params.flash_attn_type = flashAttentionEnabled
+                    ? LLAMA_FLASH_ATTN_TYPE_ENABLED
+                    : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            }
         }
 
         if (options.Has("threads")) {
-            const auto n_threads = options.Get("threads").As<Napi::Number>().Int32Value();
-            const auto resolved_n_threads = n_threads == 0 ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) : n_threads;
+            const auto threads = options.Get("threads").As<Napi::Number>().Int32Value();
+            const auto resolvedThreads = threads == 0
+                ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads)
+                : threads;
 
-            context_params.n_threads = resolved_n_threads;
-            context_params.n_threads_batch = resolved_n_threads;
+            context_params.n_threads = resolvedThreads;
+            context_params.n_threads_batch = resolvedThreads;
         }
 
         if (options.Has("performanceTracking")) {
             context_params.no_perf = !(options.Get("performanceTracking").As<Napi::Boolean>().Value());
         }
 
         if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
-            auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
+            const auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
             if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
                 context_params.type_k = static_cast<ggml_type>(keyType);
             }
         }
 
         if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
-            auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
+            const auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
             if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
                 context_params.type_v = static_cast<ggml_type>(valueType);
             }
@@ -476,8 +475,10 @@ void AddonContext::dispose() {
         contextLoaded = false;
         llama_free(ctx);
 
-        adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize);
-        loadedContextMemorySize = 0;
+        if (!model->model_params.no_alloc) {
+            adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize);
+            loadedContextMemorySize = 0;
+        }
     }
 
     model->Unref();
@@ -728,6 +729,49 @@ Napi::Value AddonContext::GetStateSize(const Napi::CallbackInfo& info) {
     return Napi::Number::From(info.Env(), llama_state_get_size(ctx));
 }
 
+Napi::Value AddonContext::GetMemoryBreakdown(const Napi::CallbackInfo& info) { // Returns a JS object { cpuRam, gpuVram } summing the context + compute sizes reported by ctx->memory_breakdown().
+    if (disposed) { // a disposed context throws a JS error and yields undefined
+        Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    if (!contextLoaded || ctx == nullptr) { // same handling when the context is not loaded
+        Napi::Error::New(info.Env(), "Context is not loaded").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    std::size_t cpuRam = 0; // running total for host / non-GPU buffers
+    std::size_t gpuVram = 0; // running total for GPU and integrated-GPU buffers
+
+    for (const auto& [bufferType, memoryBreakdown] : ctx->memory_breakdown()) { // one entry per buffer type
+        const std::size_t size = memoryBreakdown.context + memoryBreakdown.compute; // only the `context` and `compute` fields are read here
+        if (size == 0) {
+            continue; // nothing to attribute for this buffer type
+        }
+
+        if (ggml_backend_buft_is_host(bufferType)) { // host buffer types count toward cpuRam
+            cpuRam += size;
+        } else {
+            ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType); // classify by the owning device's type
+            if (device != nullptr) {
+                auto deviceType = ggml_backend_dev_type(device);
+                if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) {
+                    gpuVram += size;
+                } else {
+                    cpuRam += size; // any other device type is counted as cpuRam
+                }
+            } else {
+                cpuRam += size; // no device for this buffer type: counted as cpuRam
+            }
+        }
+    }
+
+    Napi::Object result = Napi::Object::New(info.Env());
+    result.Set("cpuRam", Napi::Number::New(info.Env(), cpuRam)); // totals are exposed as JS numbers
+    result.Set("gpuVram", Napi::Number::New(info.Env(), gpuVram));
+    return result;
+}
+
 Napi::Value AddonContext::GetThreads(const Napi::CallbackInfo& info) {
     if (disposed) {
         Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException();
@@ -1062,6 +1106,7 @@ void AddonContext::init(Napi::Object exports) {
                 InstanceMethod("sampleToken", &AddonContext::SampleToken),
                 InstanceMethod("getEmbedding", &AddonContext::GetEmbedding),
                 InstanceMethod("getStateSize", &AddonContext::GetStateSize),
+                InstanceMethod("getMemoryBreakdown", &AddonContext::GetMemoryBreakdown),
                 InstanceMethod("getThreads", &AddonContext::GetThreads),
                 InstanceMethod("setThreads", &AddonContext::SetThreads),
                 InstanceMethod("printTimings", &AddonContext::PrintTimings),

diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h
@@ -46,6 +46,7 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
 
         Napi::Value GetEmbedding(const Napi::CallbackInfo& info);
         Napi::Value GetStateSize(const Napi::CallbackInfo& info);
+        Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info);
         Napi::Value GetThreads(const Napi::CallbackInfo& info);
         Napi::Value SetThreads(const Napi::CallbackInfo& info);
 

diff --git a/llama/addon/AddonGgufMetadata.cpp b/llama/addon/AddonGgufMetadata.cpp
@@ -0,0 +1,162 @@
+#include <cstddef>
+
+#include "AddonGgufMetadata.h"
+#include "gguf.h"
+
+
+AddonGgufMetadata::AddonGgufMetadata(const Napi::CallbackInfo& info) // Constructs the wrapper around a new empty GGUF context from gguf_init_empty().
+    : Napi::ObjectWrap<AddonGgufMetadata>(info),
+      ggufMetadata(gguf_init_empty()) {
+    if (ggufMetadata.get() == nullptr) { // gguf_init_empty() returned no context
+        throw std::runtime_error("Failed to create an empty GGUF context"); // NOTE(review): this is a std::runtime_error, not a Napi::Error, and <stdexcept> is not included directly in this file - confirm the header and the addon's exception settings cover both
+    }
+}
+AddonGgufMetadata::~AddonGgufMetadata() { // Destructor: delegates cleanup to dispose().
+    dispose(); // returns early if the object was already disposed
+}
+
+void AddonGgufMetadata::dispose() { // Marks the object as disposed and resets the owned GGUF context pointer.
+    if (disposed) { // safe to call more than once: later calls do nothing
+        return;
+    }
+
+    disposed = true; // flag is set before the pointer is reset
+    ggufMetadata.reset();
+}
+
+Napi::Value AddonGgufMetadata::Dispose(const Napi::CallbackInfo& info) { // Callback registered as the "dispose" instance method: runs dispose() and returns undefined.
+    dispose();
+    return info.Env().Undefined();
+}
+
+class AddonGgufMetadataInitWorker : public Napi::AsyncWorker { // Async worker that merges GGUF metadata from `sources` into addonGgufMetadata->ggufMetadata and settles a promise with the outcome.
+    public:
+        AddonGgufMetadata* addonGgufMetadata; // target wrapper; Ref()'d in the constructor and Unref()'d in the destructor
+        std::vector<AddonGgufMetadataSource> sources; // sources to read in order; filled by the caller before the worker is queued
+        std::vector<Napi::Reference<Napi::Buffer<uint8_t>>> bufferRefs; // persistent references to the source Buffers; presumably keep them alive while Execute() reads their raw pointers - confirm
+
+        AddonGgufMetadataInitWorker(const Napi::Env& env, AddonGgufMetadata* addonGgufMetadata)
+            : Napi::AsyncWorker(env, "AddonGgufMetadataInitWorker"),
+              addonGgufMetadata(addonGgufMetadata),
+              deferred(Napi::Promise::Deferred::New(env)) {
+            addonGgufMetadata->Ref(); // paired with the Unref() in the destructor
+        }
+        ~AddonGgufMetadataInitWorker() {
+            addonGgufMetadata->Unref();
+        }
+
+        Napi::Promise GetPromise() { // promise settled by OnOK() / OnError()
+            return deferred.Promise();
+        }
+
+    protected:
+        Napi::Promise::Deferred deferred;
+
+        void Execute() { // Parses every source and copies its contents into the target GGUF context; failures are recorded with SetError().
+            try {
+                gguf_context_ptr& ggufMetadata = addonGgufMetadata->ggufMetadata; // NOTE(review): accessed without checking the wrapper's `disposed` flag - confirm dispose() cannot run while this executes
+
+                bool hasCopiedMetadata = false; // key-value data is copied from the first source only
+                for (const auto& itemSource : sources) {
+                    struct ggml_context* tensorContext = nullptr; // filled in by gguf_init_* through ggufParams.ctx
+                    struct gguf_init_params ggufParams = {
+                        /* .no_alloc = */ true,
+                        /* .ctx = */ &tensorContext,
+                    };
+                    gguf_context_ptr metadata( // buffer sources are parsed from memory, all other sources from a file path
+                        itemSource.type == AddonGgufMetadataSourceType::buffer
+                            ? gguf_init_from_buffer(itemSource.buffer.data, itemSource.buffer.length, ggufParams)
+                            : gguf_init_from_file(itemSource.path.c_str(), ggufParams)
+                    );
+                    ggml_context_ptr tensorContextGuard(tensorContext); // guard built before the check below; presumably frees tensorContext on scope exit, including the throw path - confirm
+
+                    if (metadata.get() == nullptr || tensorContext == nullptr) {
+                        throw std::runtime_error("Failed to parse GGUF metadata buffer"); // NOTE(review): the message says "buffer" even when the source was a file path
+                    }
+
+                    if (!hasCopiedMetadata) {
+                        gguf_set_kv(ggufMetadata.get(), metadata.get());
+                        hasCopiedMetadata = true;
+                    }
+
+                    for (ggml_tensor* tensor = ggml_get_first_tensor(tensorContext); tensor != nullptr; // tensors from every source are added to the target
+                        tensor = ggml_get_next_tensor(tensorContext, tensor)) {
+                        gguf_add_tensor(ggufMetadata.get(), tensor);
+                    }
+                }
+            } catch (const std::exception& e) {
+                SetError(e.what());
+            } catch (...) {
+                SetError("Unknown error when loading GGUF metadata from the given sources");
+            }
+        }
+        void OnOK() { // resolves the promise with undefined
+            deferred.Resolve(Env().Undefined());
+        }
+        void OnError(const Napi::Error& err) { // rejects the promise with the error's JS value
+            deferred.Reject(err.Value());
+        }
+};
+
+Napi::Value AddonGgufMetadata::Init(const Napi::CallbackInfo& info) { // Validates info[0] as a non-empty array of Buffers and/or path strings, queues an AddonGgufMetadataInitWorker for them and returns its promise.
+    if (disposed) { // a disposed object throws a JS error and yields undefined
+        Napi::Error::New(info.Env(), "Metadata is disposed").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    std::vector<AddonGgufMetadataSource> metadataSources; // one entry per array item, in array order
+    std::vector<Napi::Reference<Napi::Buffer<uint8_t>>> bufferRefs; // one persistent reference per Buffer item
+
+    if (info.Length() == 0 || !info[0].IsArray()) {
+        Napi::TypeError::New(info.Env(), "Expected an array of sources as the first argument").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    const auto sourceBufferValues = info[0].As<Napi::Array>();
+    const uint32_t sourcesCount = sourceBufferValues.Length();
+
+    if (sourcesCount == 0) { // an empty array is rejected with a TypeError
+        Napi::TypeError::New(info.Env(), "Expected source array to contain at least one item").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    metadataSources.reserve(sourcesCount);
+    bufferRefs.reserve(sourcesCount);
+
+    for (uint32_t i = 0; i < sourcesCount; i++) {
+        const auto sourceBufferValue = sourceBufferValues.Get(i);
+        if (sourceBufferValue.IsBuffer()) {
+            const auto sourceBuffer = sourceBufferValue.As<Napi::Buffer<uint8_t>>();
+            metadataSources.emplace_back(AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer(sourceBuffer.Data(), sourceBuffer.Length()))); // the source is built from the Buffer's data pointer and length
+            bufferRefs.emplace_back(Napi::Persistent(sourceBuffer)); // persistent reference to that same Buffer
+        } else if (sourceBufferValue.IsString()) {
+            const auto sourcePath = sourceBufferValue.As<Napi::String>().Utf8Value(); // string items are treated as file paths by the worker
+            metadataSources.emplace_back(AddonGgufMetadataSource(sourcePath));
+        } else {
+            Napi::TypeError::New(info.Env(), "Expected every source array item to be a Buffer or a string").ThrowAsJavaScriptException(); // any other item type aborts before a worker is created
+            return info.Env().Undefined();
+        }
+    }
+
+
+    AddonGgufMetadataInitWorker* worker = new AddonGgufMetadataInitWorker(info.Env(), this); // NOTE(review): raw new with no matching delete here; presumably Napi::AsyncWorker destroys itself after completion - confirm
+    worker->sources.swap(metadataSources); // hand the collected sources and buffer references over to the worker
+    worker->bufferRefs.swap(bufferRefs);
+
+    worker->Queue();
+    return worker->GetPromise();
+}
+
+void AddonGgufMetadata::init(Napi::Object exports) { // Defines the "AddonGgufMetadata" class and sets it on `exports` under that name.
+    exports.Set(
+        "AddonGgufMetadata",
+        DefineClass(
+            exports.Env(),
+            "AddonGgufMetadata",
+            {
+                InstanceMethod("init", &AddonGgufMetadata::Init), // "init" maps to AddonGgufMetadata::Init
+                InstanceMethod("dispose", &AddonGgufMetadata::Dispose), // "dispose" maps to AddonGgufMetadata::Dispose
+            }
+        )
+    );
+}