Commit d14a24f

Author: Ralf Waldukat
Add 6 new API functions from llama.cpp 2026-01-01
Implemented bindings for:
- llama_attach_threadpool / llama_detach_threadpool
- llama_params_fit (check if model will fit in memory)
- llama_state_seq_get_size_ext
- llama_state_seq_get_data_ext
- llama_state_seq_set_data_ext

Added type definitions:
- ggml_threadpool_t
- llama_state_seq_flags
- LLAMA_STATE_SEQ_FLAGS_SWA_ONLY
- LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY

All 218 C API functions now have Python bindings.
1 parent 1f0241e commit d14a24f
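
The "218 C API functions" figure can be roughly sanity-checked by counting the @ctypes_function declarations in llama_cpp/llama_cpp.py. A minimal sketch (hypothetical helper, not part of this commit; it only counts llama_* names registered through that decorator):

    import re

    with open("llama_cpp/llama_cpp.py") as f:
        src = f.read()

    # Every low-level binding in this file is declared as @ctypes_function("llama_...", ...)
    names = re.findall(r'@ctypes_function\(\s*"(llama_\w+)"', src)
    print(f"{len(names)} llama_* functions bound")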

File tree

1 file changed (+141 −2 lines)


llama_cpp/llama_cpp.py

Lines changed: 141 additions & 2 deletions
@@ -126,6 +126,9 @@
     None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p
 )

+# typedef struct ggml_threadpool * ggml_threadpool_t;
+ggml_threadpool_t = ctypes.c_void_p
+
 # llama.h bindings

 _lib.llama_max_devices.argtypes = []

@@ -186,6 +189,13 @@
 # typedef int32_t llama_seq_id;
 llama_seq_id = ctypes.c_int32

+# typedef uint32_t llama_state_seq_flags;
+llama_state_seq_flags = ctypes.c_uint32
+
+# State sequence flags
+LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
+LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1
+

 # enum llama_vocab_type {
 #     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab

@@ -1197,11 +1207,26 @@ def llama_numa_init(numa: int, /): ...
 #     struct llama_context * ctx,
 #     ggml_threadpool_t threadpool,
 #     ggml_threadpool_t threadpool_batch);
-# TODO: Add llama_attach_threadpool
+@ctypes_function(
+    "llama_attach_threadpool",
+    [llama_context_p_ctypes, ggml_threadpool_t, ggml_threadpool_t],
+    None,
+)
+def llama_attach_threadpool(
+    ctx: llama_context_p,
+    threadpool: int,
+    threadpool_batch: int,
+    /,
+):
+    """Attach threadpools to context"""
+    ...


 # LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
-# TODO: Add llama_detach_threadpool
+@ctypes_function("llama_detach_threadpool", [llama_context_p_ctypes], None)
+def llama_detach_threadpool(ctx: llama_context_p, /):
+    """Detach threadpool from context"""
+    ...


 # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
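
A minimal usage sketch for the new threadpool bindings, assuming `ctx` is an existing llama_context and `tp` is an opaque ggml_threadpool pointer obtained from a separately bound ggml API (not part of this commit; ggml_threadpool_t is only an alias for ctypes.c_void_p here):

    from llama_cpp.llama_cpp import llama_attach_threadpool, llama_detach_threadpool

    llama_attach_threadpool(ctx, tp, None)  # None: no separate batch threadpool supplied
    # ... run decoding / generation on ctx ...
    llama_detach_threadpool(ctx)
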
@@ -1375,6 +1400,43 @@ def llama_supports_rpc() -> bool: ...
 def llama_max_tensor_buft_overrides() -> int:
     """Get maximum number of tensor buffer type overrides"""
     ...
+
+
+# LLAMA_API enum llama_params_fit_status llama_params_fit(
+#     const char * path_model,
+#     struct llama_model_params * mparams,
+#     struct llama_context_params * cparams,
+#     float * tensor_split,
+#     struct llama_model_tensor_buft_override * tensor_buft_overrides,
+#     size_t n_buft_overrides);
+@ctypes_function(
+    "llama_params_fit",
+    [
+        ctypes.c_char_p,
+        ctypes.POINTER(llama_model_params),
+        ctypes.POINTER(llama_context_params),
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_void_p,  # tensor_buft_overrides - not fully bound
+        ctypes.c_size_t,
+    ],
+    ctypes.c_int,
+)
+def llama_params_fit(
+    path_model: bytes,
+    mparams: CtypesPointerOrRef[llama_model_params],
+    cparams: CtypesPointerOrRef[llama_context_params],
+    tensor_split: CtypesArray[ctypes.c_float],
+    tensor_buft_overrides: int,
+    n_buft_overrides: Union[ctypes.c_size_t, int],
+    /,
+) -> int:
+    """Check if model parameters will fit in memory
+
+    Returns:
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS (0) - found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE (1) - could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR (2) - a hard error occurred
+    """
     ...

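
A hedged usage sketch for llama_params_fit, assuming the default-params helpers and llama_max_devices from the same module; the model path is a placeholder and the status is compared against the raw value documented in the docstring above:

    import ctypes
    from llama_cpp.llama_cpp import (
        llama_params_fit,
        llama_model_default_params,
        llama_context_default_params,
        llama_max_devices,
    )

    path = b"/path/to/model.gguf"  # placeholder
    mparams = llama_model_default_params()
    cparams = llama_context_default_params()
    tensor_split = (ctypes.c_float * llama_max_devices())()  # one entry per device

    status = llama_params_fit(
        path,
        ctypes.byref(mparams),
        ctypes.byref(cparams),
        tensor_split,
        None,  # tensor_buft_overrides is only bound as an opaque pointer
        0,
    )
    if status == 0:  # LLAMA_PARAMS_FIT_STATUS_SUCCESS
        print("model and context are projected to fit in memory")
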
@@ -2515,6 +2577,83 @@ def llama_state_seq_load_file(
 ) -> int: ...


+# LLAMA_API size_t llama_state_seq_get_size_ext(
+#     struct llama_context * ctx,
+#     llama_seq_id seq_id,
+#     llama_state_seq_flags flags);
+@ctypes_function(
+    "llama_state_seq_get_size_ext",
+    [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_size_ext(
+    ctx: llama_context_p,
+    seq_id: Union[llama_seq_id, int],
+    flags: Union[llama_state_seq_flags, int],
+    /,
+) -> int:
+    """Get size needed to copy sequence state with flags"""
+    ...
+
+
+# LLAMA_API size_t llama_state_seq_get_data_ext(
+#     struct llama_context * ctx,
+#     uint8_t * dst,
+#     size_t size,
+#     llama_seq_id seq_id,
+#     llama_state_seq_flags flags);
+@ctypes_function(
+    "llama_state_seq_get_data_ext",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+        llama_state_seq_flags,
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_data_ext(
+    ctx: llama_context_p,
+    dst: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    seq_id: Union[llama_seq_id, int],
+    flags: Union[llama_state_seq_flags, int],
+    /,
+) -> int:
+    """Copy sequence state to buffer with flags"""
+    ...
+
+
+# LLAMA_API size_t llama_state_seq_set_data_ext(
+#     struct llama_context * ctx,
+#     const uint8_t * src,
+#     size_t size,
+#     llama_seq_id dest_seq_id,
+#     llama_state_seq_flags flags);
+@ctypes_function(
+    "llama_state_seq_set_data_ext",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+        llama_state_seq_flags,
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_set_data_ext(
+    ctx: llama_context_p,
+    src: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    dest_seq_id: Union[llama_seq_id, int],
+    flags: Union[llama_state_seq_flags, int],
+    /,
+) -> int:
+    """Restore sequence state from buffer with flags"""
+    ...
+
+
 # //
 # // Decoding
 # //
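
A brief sketch of how the new _ext state functions fit together, assuming `ctx` is a live llama_context whose sequence 0 holds state worth copying; passing 0 for flags presumably matches the existing non-_ext variants, while LLAMA_STATE_SEQ_FLAGS_SWA_ONLY presumably restricts the copy to the SWA cache:

    import ctypes
    from llama_cpp.llama_cpp import (
        llama_state_seq_get_size_ext,
        llama_state_seq_get_data_ext,
        llama_state_seq_set_data_ext,
        LLAMA_STATE_SEQ_FLAGS_SWA_ONLY,
    )

    flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY  # or 0 for the full sequence state
    n = llama_state_seq_get_size_ext(ctx, 0, flags)  # bytes needed for seq 0
    buf = (ctypes.c_uint8 * n)()
    written = llama_state_seq_get_data_ext(ctx, buf, n, 0, flags)
    # ... later, restore the saved state into sequence 1 of a compatible context ...
    read = llama_state_seq_set_data_ext(ctx, buf, written, 1, flags)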
