Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
af2146d
feat: Gemma 4 support
giladgd Apr 6, 2026
2471df8
fix: Gemma 4 resource requirements estimation
giladgd Apr 12, 2026
67b60c6
feat: more precise resource usage estimation, auto flash attention, r…
giladgd Apr 28, 2026
2fe0dd9
fix: Vulkan backend successful load detection even when no devices ar…
giladgd Apr 28, 2026
d5c4c2c
feat: optimize grammar sampling performance
giladgd Apr 28, 2026
2f01d10
fix: resolve Gemma 4 chat wrapper for relevant models
giladgd Apr 28, 2026
4ce206b
test: gemma 4 function calling
giladgd Apr 28, 2026
dacca3b
feat: `useMmap: "auto"`, bug fixes, fix tests
giladgd Apr 28, 2026
3142124
feat: support `Q1_0` quant, fix `MXFP4_MOE` quant name
giladgd Apr 28, 2026
b48681f
fix: apply `llama.cpp` patches if pending PRs aren't merged yet
giladgd May 5, 2026
74fef2f
fix: adapt to breaking `llama.cpp` changes
giladgd May 5, 2026
c772709
test: fix tests
giladgd May 5, 2026
40d204d
fix: bug
giladgd May 5, 2026
0acdc31
test: fix tests
giladgd May 5, 2026
ea7fce0
fix: type
giladgd May 5, 2026
3fc0363
test: fix tests
giladgd May 6, 2026
aa50af2
fix: don't crash on unsupported model architecture
giladgd May 6, 2026
ca607fd
feat: improve stability on unified memory systems
giladgd May 20, 2026
79543d9
fix: bugs
giladgd May 20, 2026
547c692
fix: correct wired memory calculation
giladgd May 20, 2026
fe284fe
fix: improve measure safety
giladgd May 20, 2026
5ef1c2b
fix: bug
giladgd May 20, 2026
1c62b87
fix: bug
giladgd May 20, 2026
b099ead
fix: bugs
giladgd May 20, 2026
6b387e6
fix: remove patch for merged PR
giladgd May 25, 2026
720a2d2
Merge remote-tracking branch 'origin/master' into gilad/gemma4
giladgd May 26, 2026
7f91df0
feat: try using github token to fetch latest llama.cpp release on rat…
giladgd May 26, 2026
cb6f8c1
feat: disabled residency sets on macOS by default for better OS respo…
giladgd May 26, 2026
6977bcd
fix: bug
giladgd May 26, 2026
9d9cccb
fix: Windows LLVM toolchain
giladgd May 27, 2026
0cf657e
feat: more optimized local build
giladgd May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vitepress/config/apiReferenceSidebar.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ const chatWrappersOrder = [
"Llama3ChatWrapper",
"Llama2ChatWrapper",
"MistralChatWrapper",
"Gemma4ChatWrapper",
"GemmaChatWrapper",
"ChatMLChatWrapper",
"FalconChatWrapper",
Expand Down
4 changes: 2 additions & 2 deletions llama/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)

addVariantSuffix(llama ${NLC_VARIANT})
addVariantSuffix(ggml ${NLC_VARIANT})
addVariantSuffix(llama "${NLC_VARIANT}")
addVariantSuffix(ggml "${NLC_VARIANT}")

file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})

Expand Down
107 changes: 76 additions & 31 deletions llama/addon/AddonContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <algorithm>
#include <cmath>
#include "common/common.h"
#include "llama-context.h"
#include "llama-vocab.h"
#include "llama.h"

Expand Down Expand Up @@ -107,15 +108,15 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker {
try {
context->ctx = llama_init_from_model(context->model->model, context->context_params);

context->contextLoaded = context->ctx != nullptr && context->ctx != NULL;
context->contextLoaded = context->ctx != nullptr;
} catch (const std::exception& e) {
SetError(e.what());
} catch(...) {
SetError("Unknown error when calling \"llama_init_from_model\"");
}
}
void OnOK() {
if (context->contextLoaded) {
if (context->contextLoaded && !context->model->model_params.no_alloc) {
uint64_t contextMemorySize = llama_state_get_size(context->ctx);
adjustNapiExternalMemoryAdd(Env(), contextMemorySize);
context->loadedContextMemorySize = contextMemorySize;
Expand Down Expand Up @@ -173,8 +174,10 @@ class AddonContextUnloadContextWorker : public Napi::AsyncWorker {
}
}
void OnOK() {
adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize);
context->loadedContextMemorySize = 0;
if (!context->model->model_params.no_alloc) {
adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize);
context->loadedContextMemorySize = 0;
}

adjustNapiExternalMemorySubtract(Env(), context->batchMemorySize);
context->batchMemorySize = 0;
Expand Down Expand Up @@ -251,22 +254,8 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {

sampler->rebuildChainIfNeeded();

const auto * logits = llama_get_logits_ith(ctx->ctx, batchLogitIndex);
const int n_vocab = llama_vocab_n_tokens(ctx->model->vocab);

auto & candidates = sampler->tokenCandidates;
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}

llama_token_data_array cur_p = {
/* .data = */ candidates.data(),
/* .size = */ candidates.size(),
/* .selected = */ -1,
/* .sorted = */ false,
};

llama_sampler_apply(sampler->chain, &cur_p);
llama_token_data_array cur_p;
sampler->sample(ctx->ctx, batchLogitIndex, cur_p, returnProbabilities || returnConfidence);

if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
no_output = true;
Expand Down Expand Up @@ -403,7 +392,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
context_params.swa_full = false;

if (info.Length() > 1 && info[1].IsObject()) {
Napi::Object options = info[1].As<Napi::Object>();
const auto options = info[1].As<Napi::Object>();

if (options.Has("contextSize")) {
context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Uint32Value();
Expand All @@ -427,31 +416,41 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
}

if (options.Has("flashAttention")) {
bool flashAttention = options.Get("flashAttention").As<Napi::Boolean>().Value();
context_params.flash_attn_type = flashAttention ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
const auto flashAttention = options.Get("flashAttention");

if (flashAttention.IsString() && flashAttention.As<Napi::String>().Utf8Value() == "auto") {
context_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
const bool flashAttentionEnabled = flashAttention.As<Napi::Boolean>().Value();
context_params.flash_attn_type = flashAttentionEnabled
? LLAMA_FLASH_ATTN_TYPE_ENABLED
: LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
}

if (options.Has("threads")) {
const auto n_threads = options.Get("threads").As<Napi::Number>().Int32Value();
const auto resolved_n_threads = n_threads == 0 ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) : n_threads;
const auto threads = options.Get("threads").As<Napi::Number>().Int32Value();
const auto resolvedThreads = threads == 0
? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads)
: threads;

context_params.n_threads = resolved_n_threads;
context_params.n_threads_batch = resolved_n_threads;
context_params.n_threads = resolvedThreads;
context_params.n_threads_batch = resolvedThreads;
}

if (options.Has("performanceTracking")) {
context_params.no_perf = !(options.Get("performanceTracking").As<Napi::Boolean>().Value());
}

if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
const auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
context_params.type_k = static_cast<ggml_type>(keyType);
}
}

if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
const auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
context_params.type_v = static_cast<ggml_type>(valueType);
}
Expand All @@ -476,8 +475,10 @@ void AddonContext::dispose() {
contextLoaded = false;
llama_free(ctx);

adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize);
loadedContextMemorySize = 0;
if (!model->model_params.no_alloc) {
adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize);
loadedContextMemorySize = 0;
}
}

model->Unref();
Expand Down Expand Up @@ -728,6 +729,49 @@ Napi::Value AddonContext::GetStateSize(const Napi::CallbackInfo& info) {
return Napi::Number::From(info.Env(), llama_state_get_size(ctx));
}

// Returns a JS object `{cpuRam, gpuVram}` holding the summed `context + compute`
// sizes reported by `ctx->memory_breakdown()`, split by where each buffer type lives.
// Throws a JS error (and returns `undefined`) when the context is disposed or not loaded.
Napi::Value AddonContext::GetMemoryBreakdown(const Napi::CallbackInfo& info) {
    const Napi::Env env = info.Env();

    if (disposed) {
        Napi::Error::New(env, "Context is disposed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    if (!contextLoaded || ctx == nullptr) {
        Napi::Error::New(env, "Context is not loaded").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    std::size_t hostBytes = 0;
    std::size_t gpuBytes = 0;

    for (const auto& [bufferType, usage] : ctx->memory_breakdown()) {
        const std::size_t bytes = usage.context + usage.compute;
        if (bytes == 0) {
            continue;
        }

        // A buffer counts as GPU memory only when it is a non-host buffer type
        // whose device is known and is a discrete or integrated GPU.
        // Everything else is attributed to CPU RAM.
        bool livesOnGpu = false;
        if (!ggml_backend_buft_is_host(bufferType)) {
            ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType);
            if (device != nullptr) {
                const auto deviceType = ggml_backend_dev_type(device);
                livesOnGpu = deviceType == GGML_BACKEND_DEVICE_TYPE_GPU ||
                    deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU;
            }
        }

        if (livesOnGpu) {
            gpuBytes += bytes;
        } else {
            hostBytes += bytes;
        }
    }

    Napi::Object breakdown = Napi::Object::New(env);
    breakdown.Set("cpuRam", Napi::Number::New(env, hostBytes));
    breakdown.Set("gpuVram", Napi::Number::New(env, gpuBytes));
    return breakdown;
}

Napi::Value AddonContext::GetThreads(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException();
Expand Down Expand Up @@ -1062,6 +1106,7 @@ void AddonContext::init(Napi::Object exports) {
InstanceMethod("sampleToken", &AddonContext::SampleToken),
InstanceMethod("getEmbedding", &AddonContext::GetEmbedding),
InstanceMethod("getStateSize", &AddonContext::GetStateSize),
InstanceMethod("getMemoryBreakdown", &AddonContext::GetMemoryBreakdown),
InstanceMethod("getThreads", &AddonContext::GetThreads),
InstanceMethod("setThreads", &AddonContext::SetThreads),
InstanceMethod("printTimings", &AddonContext::PrintTimings),
Expand Down
1 change: 1 addition & 0 deletions llama/addon/AddonContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {

Napi::Value GetEmbedding(const Napi::CallbackInfo& info);
Napi::Value GetStateSize(const Napi::CallbackInfo& info);
Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info);
Napi::Value GetThreads(const Napi::CallbackInfo& info);
Napi::Value SetThreads(const Napi::CallbackInfo& info);

Expand Down
162 changes: 162 additions & 0 deletions llama/addon/AddonGgufMetadata.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

#include "AddonGgufMetadata.h"
#include "gguf.h"


// Creates the wrapper with a fresh, empty GGUF context that `init` later fills in.
//
// On allocation failure a JavaScript exception is raised via
// `ThrowAsJavaScriptException` instead of throwing `std::runtime_error`:
// by default node-addon-api only translates `Napi::Error` thrown from native callbacks,
// so any other C++ exception type would escape through the N-API boundary.
AddonGgufMetadata::AddonGgufMetadata(const Napi::CallbackInfo& info)
    : Napi::ObjectWrap<AddonGgufMetadata>(info),
      ggufMetadata(gguf_init_empty()) {
    if (ggufMetadata.get() == nullptr) {
        Napi::Error::New(info.Env(), "Failed to create an empty GGUF context").ThrowAsJavaScriptException();
        return;
    }
}
// Ensures the owned GGUF context is released when the wrapper is destroyed;
// `dispose()` is a no-op if the object was already disposed explicitly.
AddonGgufMetadata::~AddonGgufMetadata() {
dispose();
}

// Releases the owned GGUF context exactly once; subsequent calls do nothing.
void AddonGgufMetadata::dispose() {
    if (!disposed) {
        disposed = true;
        ggufMetadata.reset();
    }
}

// JS-facing `dispose()` method: releases the native state and returns `undefined`.
Napi::Value AddonGgufMetadata::Dispose(const Napi::CallbackInfo& info) {
    const Napi::Env env = info.Env();
    dispose();
    return env.Undefined();
}

class AddonGgufMetadataInitWorker : public Napi::AsyncWorker {
public:
AddonGgufMetadata* addonGgufMetadata;
std::vector<AddonGgufMetadataSource> sources;
std::vector<Napi::Reference<Napi::Buffer<uint8_t>>> bufferRefs;

AddonGgufMetadataInitWorker(const Napi::Env& env, AddonGgufMetadata* addonGgufMetadata)
: Napi::AsyncWorker(env, "AddonGgufMetadataInitWorker"),
addonGgufMetadata(addonGgufMetadata),
deferred(Napi::Promise::Deferred::New(env)) {
addonGgufMetadata->Ref();
}
~AddonGgufMetadataInitWorker() {
addonGgufMetadata->Unref();
}

Napi::Promise GetPromise() {
return deferred.Promise();
}

protected:
Napi::Promise::Deferred deferred;

void Execute() {
try {
gguf_context_ptr& ggufMetadata = addonGgufMetadata->ggufMetadata;

bool hasCopiedMetadata = false;
for (const auto& itemSource : sources) {
struct ggml_context* tensorContext = nullptr;
struct gguf_init_params ggufParams = {
/* .no_alloc = */ true,
/* .ctx = */ &tensorContext,
};
gguf_context_ptr metadata(
itemSource.type == AddonGgufMetadataSourceType::buffer
? gguf_init_from_buffer(itemSource.buffer.data, itemSource.buffer.length, ggufParams)
: gguf_init_from_file(itemSource.path.c_str(), ggufParams)
);
ggml_context_ptr tensorContextGuard(tensorContext);

if (metadata.get() == nullptr || tensorContext == nullptr) {
throw std::runtime_error("Failed to parse GGUF metadata buffer");
}

if (!hasCopiedMetadata) {
gguf_set_kv(ggufMetadata.get(), metadata.get());
hasCopiedMetadata = true;
}

for (ggml_tensor* tensor = ggml_get_first_tensor(tensorContext); tensor != nullptr;
tensor = ggml_get_next_tensor(tensorContext, tensor)) {
gguf_add_tensor(ggufMetadata.get(), tensor);
}
}
} catch (const std::exception& e) {
SetError(e.what());
} catch (...) {
SetError("Unknown error when loading GGUF metadata from the given sources");
}
}
void OnOK() {
deferred.Resolve(Env().Undefined());
}
void OnError(const Napi::Error& err) {
deferred.Reject(err.Value());
}
};

// JS-facing `init(sources)` method.
//
// `sources` must be a non-empty array whose items are either a `Buffer` holding GGUF
// data or a string file path. After validation, an `AddonGgufMetadataInitWorker` is
// queued to do the parsing, and its promise is returned.
// On invalid input a JS exception is thrown and `undefined` is returned.
Napi::Value AddonGgufMetadata::Init(const Napi::CallbackInfo& info) {
    const Napi::Env env = info.Env();

    if (disposed) {
        Napi::Error::New(env, "Metadata is disposed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    std::vector<AddonGgufMetadataSource> parsedSources;
    std::vector<Napi::Reference<Napi::Buffer<uint8_t>>> pinnedBuffers;

    const bool hasSourcesArray = info.Length() != 0 && info[0].IsArray();
    if (!hasSourcesArray) {
        Napi::TypeError::New(env, "Expected an array of sources as the first argument").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    const auto sourcesArray = info[0].As<Napi::Array>();
    const uint32_t itemsCount = sourcesArray.Length();

    if (itemsCount == 0) {
        Napi::TypeError::New(env, "Expected source array to contain at least one item").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    parsedSources.reserve(itemsCount);
    pinnedBuffers.reserve(itemsCount);

    for (uint32_t index = 0; index < itemsCount; index++) {
        const auto item = sourcesArray.Get(index);

        if (item.IsBuffer()) {
            const auto buffer = item.As<Napi::Buffer<uint8_t>>();
            parsedSources.emplace_back(AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer(buffer.Data(), buffer.Length())));
            // Hold a persistent reference so the buffer's memory outlives this call
            pinnedBuffers.emplace_back(Napi::Persistent(buffer));
            continue;
        }

        if (item.IsString()) {
            const auto filePath = item.As<Napi::String>().Utf8Value();
            parsedSources.emplace_back(AddonGgufMetadataSource(filePath));
            continue;
        }

        Napi::TypeError::New(env, "Expected every source array item to be a Buffer or a string").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    // Hand the collected sources and buffer references over to the worker
    AddonGgufMetadataInitWorker* initWorker = new AddonGgufMetadataInitWorker(env, this);
    initWorker->sources.swap(parsedSources);
    initWorker->bufferRefs.swap(pinnedBuffers);

    initWorker->Queue();
    return initWorker->GetPromise();
}

// Registers the `AddonGgufMetadata` class (with its `init` and `dispose`
// instance methods) on the given exports object.
void AddonGgufMetadata::init(Napi::Object exports) {
    const Napi::Env env = exports.Env();

    const Napi::Function classConstructor = DefineClass(
        env,
        "AddonGgufMetadata",
        {
            InstanceMethod("init", &AddonGgufMetadata::Init),
            InstanceMethod("dispose", &AddonGgufMetadata::Dispose),
        }
    );

    exports.Set("AddonGgufMetadata", classConstructor);
}
Loading
Loading