From b13e0bcaf7132127c1e9caee360a90bdb52c76bc Mon Sep 17 00:00:00 2001
From: zpin
Date: Tue, 29 Apr 2025 16:01:50 +0200
Subject: [PATCH 1/2] Added support for overriding tensor buffer types

---
 llama_cpp/llama.py     | 42 ++++++++++++++++++++
 llama_cpp/llama_cpp.py | 88 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 7e9a6af23..9ad0805f6 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -115,6 +115,7 @@ def __init__(
         # Misc
         spm_infill: bool = False,
         verbose: bool = True,
+        override_tensor: Optional[str] = None,
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -187,6 +188,7 @@ def __init__(
             type_k: KV cache data type for K (default: f16)
             type_v: KV cache data type for V (default: f16)
             spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
+            override_tensor: <tensor name pattern>=<buffer type>,... Override the buffer type used for tensors whose names match the given patterns (same syntax as llama.cpp's --override-tensor).

         Raises:
             ValueError: If the model path does not exist.
@@ -364,6 +366,46 @@ def __init__(

         self.spm_infill = spm_infill

+        self._c_tensor_buft_overrides = None
+        if override_tensor is not None:
+
+            buft_overrides = []
+            buft_list = {}
+            # Enumerate all devices and map each buffer type name to its buffer type
+            for i in range(llama_cpp.ggml_backend_dev_count()):
+                dev = llama_cpp.ggml_backend_dev_get(i)
+                buft = llama_cpp.ggml_backend_dev_buffer_type(dev)
+                if buft:
+                    buft_name = llama_cpp.ggml_backend_buft_name(buft).decode('utf-8')
+                    buft_list[buft_name] = buft
+
+            # Parse the comma-separated "<tensor name pattern>=<buffer type>" overrides
+            for override in override_tensor.split(','):
+                pos = override.find('=')
+                if pos == -1:
+                    raise ValueError(f"invalid override_tensor entry: {override!r} (expected <tensor name pattern>=<buffer type>)")
+
+                tensor_name = override[:pos]
+                buffer_type = override[pos+1:]
+
+                if buffer_type not in buft_list:
+                    print("Available buffer types:")
+                    for name in buft_list:
+                        print(f"  {name}")
+                    raise ValueError(f"unknown buffer type: {buffer_type!r}")
+
+                buft_overrides.append(
+                    llama_cpp.llama_model_tensor_buft_override(
+                        pattern=tensor_name.encode('utf-8'),
+                        buft=buft_list[buffer_type]
+                    )
+                )
+            array_type = llama_cpp.llama_model_tensor_buft_override * (len(buft_overrides) + 1)  # +1 for the zero-initialized terminator entry
+            self._c_tensor_buft_overrides = array_type(
+                *buft_overrides
+            )
+            self.model_params.tensor_buft_overrides = self._c_tensor_buft_overrides  # keep a reference on self so the array outlives loading
+
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 710bd83c8..04a409a81 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -640,10 +640,94 @@ class llama_model_kv_override(ctypes.Structure):

     value: Union[int, float, bool, bytes]


+
+# struct ggml_backend_buffer_type_i {
+#     const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
+#     // allocate a buffer of this type
+#     ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
+#     // tensor alignment
+#     size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
+#     // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+#     size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
+#     // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+#     size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+#     // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
+#     bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+# };
+class ggml_backend_buffer_type_i(ctypes.Structure):
+    _fields_ = [
+        ("get_name", ctypes.c_void_p),  # NOTE: Unused
+        ("alloc_buffer", ctypes.c_void_p),  # NOTE: Unused
+        ("get_alignment", ctypes.c_void_p),  # NOTE: Unused
+        ("get_max_size", ctypes.c_void_p),  # NOTE: Unused
+        ("get_alloc_size", ctypes.c_void_p),  # NOTE: Unused
+        ("is_host", ctypes.c_void_p)  # NOTE: Unused
+    ]
+
+# typedef struct ggml_backend_device * ggml_backend_dev_t;
+ggml_backend_dev_t = ctypes.c_void_p
+
+# struct ggml_backend_buffer_type {
+#     struct ggml_backend_buffer_type_i iface;
+#     ggml_backend_dev_t                device;
+#     void *                            context;
+# };
+class ggml_backend_buffer_type(ctypes.Structure):
+    _fields_ = [
+        ("iface", ggml_backend_buffer_type_i),
+        ("device", ggml_backend_dev_t),
+        ("context", ctypes.c_void_p)
+    ]
+
+# typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+ggml_backend_buffer_type_t = ctypes.POINTER(ggml_backend_buffer_type)
+
 # struct llama_model_tensor_buft_override {
 #     const char * pattern;
 #     ggml_backend_buffer_type_t buft;
 # };
+class llama_model_tensor_buft_override(ctypes.Structure):
+    _fields_ = [
+        ("pattern", ctypes.c_char_p),
+        ("buft", ggml_backend_buffer_type_t),
+    ]
+
+
+# GGML_API size_t ggml_backend_dev_count(void);
+@ctypes_function(
+    "ggml_backend_dev_count",
+    [],
+    ctypes.c_size_t,
+)
+def ggml_backend_dev_count() -> int:
+    ...
+
+# GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+@ctypes_function(
+    "ggml_backend_dev_get",
+    [ctypes.c_size_t],
+    ggml_backend_dev_t,
+)
+def ggml_backend_dev_get(index: int, /) -> ggml_backend_dev_t:
+    ...
+
+# GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+@ctypes_function(
+    "ggml_backend_dev_buffer_type",
+    [ggml_backend_dev_t],
+    ggml_backend_buffer_type_t,
+)
+def ggml_backend_dev_buffer_type(device: ggml_backend_dev_t, /) -> ggml_backend_buffer_type_t:
+    ...
+
+# GGML_API const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft);
+@ctypes_function(
+    "ggml_backend_buft_name",
+    [ggml_backend_buffer_type_t],
+    ctypes.c_char_p,
+)
+def ggml_backend_buft_name(buft: ggml_backend_buffer_type_t, /) -> bytes:
+    ...


 # struct llama_model_params {
@@ -703,7 +787,7 @@ class llama_model_params(ctypes.Structure):

     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
-        tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override]  # NOTE: unused
+        tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override]
         n_gpu_layers: int
         split_mode: int
         main_gpu: int
@@ -718,7 +802,7 @@ class llama_model_params(ctypes.Structure):

     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
-        ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused
+        ("tensor_buft_overrides", ctypes.POINTER(llama_model_tensor_buft_override)),
         ("n_gpu_layers", ctypes.c_int32),
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),

From a5523bdd0f1bd98d3d7f45dd6934cd1bf6fa72c2 Mon Sep 17 00:00:00 2001
From: zpin
Date: Wed, 30 Apr 2025 22:40:57 +0200
Subject: [PATCH 2/2] Only apply override_tensor when non-empty

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 9ad0805f6..c0aac2d7b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -367,7 +367,7 @@ def __init__(
         self.spm_infill = spm_infill

         self._c_tensor_buft_overrides = None
-        if override_tensor is not None:
+        if override_tensor:

             buft_overrides = []
             buft_list = {}
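
Usage note (illustrative, not part of the patches): llama.cpp matches the
pattern half of each override as a regex against tensor names, and the buffer
type half must be one of the names reported by the available backends ("CPU"
is always present; a CUDA build also exposes e.g. "CUDA0"). A minimal sketch,
assuming a hypothetical local GGUF model path:

    from llama_cpp import Llama

    # Offload all layers to the GPU, but pin the large MoE expert tensors
    # to host memory by overriding their buffer type.
    llm = Llama(
        model_path="./models/model.gguf",  # hypothetical path
        n_gpu_layers=-1,
        override_tensor=r"\.ffn_.*_exps\.=CPU",
    )

Multiple overrides are comma-separated, e.g.
override_tensor=r"\.ffn_.*_exps\.=CPU,output\.weight=CUDA0".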