Added support for overriding tensor buffer types #2007

Open · wants to merge 2 commits into base: main
42 changes: 42 additions & 0 deletions llama_cpp/llama.py
@@ -115,6 +115,7 @@ def __init__(
# Misc
spm_infill: bool = False,
verbose: bool = True,
override_tensor: Optional[str] = None,
# Extra Params
**kwargs, # type: ignore
):
@@ -187,6 +188,7 @@ def __init__(
type_k: KV cache data type for K (default: f16)
type_v: KV cache data type for V (default: f16)
spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
override_tensor: Comma-separated list of <tensor name pattern>=<buffer type> pairs; tensors whose names match a pattern are placed in the given backend buffer type instead of the default placement.

Raises:
ValueError: If the model path does not exist.
@@ -364,6 +366,46 @@ def __init__(

self.spm_infill = spm_infill

self._c_tensor_buft_overrides = None
if override_tensor:

buft_overrides = []
buft_list = {}
# Enumerate all devices and add their buffer types to the list
for i in range(llama_cpp.ggml_backend_dev_count()):
dev = llama_cpp.ggml_backend_dev_get(i)
buft = llama_cpp.ggml_backend_dev_buffer_type(dev)
if buft:
buft_name = llama_cpp.ggml_backend_buft_name(buft).decode('utf-8')
buft_list[buft_name] = buft

# Process overrides
for override in override_tensor.split(','):
pos = override.find('=')
if pos == -1:
raise ValueError("invalid value")

tensor_name = override[:pos]
buffer_type = override[pos+1:]

if buffer_type not in buft_list:
print("Available buffer types:")
for name in buft_list:
print(f" {name}")
raise ValueError("unknown buffer type")

buft_overrides.append(
llama_cpp.llama_model_tensor_buft_override(
pattern=tensor_name.encode('utf-8'),
buft=buft_list[buffer_type]
)
)
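# NOTE: the extra trailing element is zero-initialized by ctypes, producing a NULL pattern entry that serves as the end-of-array sentinel expected by llama.cpp.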
array_type = llama_cpp.llama_model_tensor_buft_override * (len(buft_overrides) + 1)
self._c_tensor_buft_overrides = array_type(
*buft_overrides
)
self.model_params.tensor_buft_overrides = self._c_tensor_buft_overrides

if not os.path.exists(model_path):
raise ValueError(f"Model path does not exist: {model_path}")

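A minimal usage sketch for the new parameter. The model path and the tensor name pattern below are placeholders, and the accepted buffer type names (for example "CPU" or "CUDA0") depend on the backends available in the installed llama.cpp build:

from llama_cpp import Llama

# Hypothetical example: keep tensors whose names match the pattern in the CPU
# buffer type while the rest of the model is offloaded to the GPU.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_gpu_layers=-1,
    override_tensor="blk\\..*\\.ffn_.*=CPU",  # <tensor name pattern>=<buffer type>
)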
88 changes: 86 additions & 2 deletions llama_cpp/llama_cpp.py
@@ -640,10 +640,94 @@ class llama_model_kv_override(ctypes.Structure):
value: Union[int, float, bool, bytes]



# struct ggml_backend_buffer_type_i {
# const char * (*get_name) (ggml_backend_buffer_type_t buft);
# // allocate a buffer of this type
# ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
# // tensor alignment
# size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
# // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
# size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
# // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
# size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
# // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
# bool (*is_host) (ggml_backend_buffer_type_t buft);
# };
class ggml_backend_buffer_type_i(ctypes.Structure):
_fields_ = [
("get_name", ctypes.c_void_p), # NOTE: Unused
("alloc_buffer", ctypes.c_void_p), # NOTE: Unused
("get_alignment", ctypes.c_void_p), # NOTE: Unused
("get_max_size", ctypes.c_void_p), # NOTE: Unused
("get_alloc_size", ctypes.c_void_p), # NOTE: Unused
("is_host", ctypes.c_void_p) # NOTE: Unused
]

# typedef struct ggml_backend_device * ggml_backend_dev_t;
ggml_backend_dev_t = ctypes.c_void_p # NOTE: Unused

# struct ggml_backend_buffer_type {
# struct ggml_backend_buffer_type_i iface;
# ggml_backend_dev_t device;
# void * context;
# };
class ggml_backend_buffer_type(ctypes.Structure):
_fields_ = [
("iface", ggml_backend_buffer_type_i),
("device", ggml_backend_dev_t),
("context", ctypes.c_void_p)
]

# typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
ggml_backend_buffer_type_t = ctypes.POINTER(ggml_backend_buffer_type)

# struct llama_model_tensor_buft_override {
# const char * pattern;
# ggml_backend_buffer_type_t buft;
# };
class llama_model_tensor_buft_override(ctypes.Structure):
_fields_ = [
("pattern", ctypes.c_char_p),
("buft", ggml_backend_buffer_type_t),
]


# GGML_API size_t ggml_backend_dev_count(void);
@ctypes_function(
"ggml_backend_dev_count",
[],
ctypes.c_size_t,
)
def ggml_backend_dev_count() -> int:
...

# GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
@ctypes_function(
"ggml_backend_dev_get",
[ctypes.c_size_t],
ggml_backend_dev_t,
)
def ggml_backend_dev_get(index: int, /) -> ggml_backend_dev_t:
...

# GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
@ctypes_function(
"ggml_backend_dev_buffer_type",
[ggml_backend_dev_t],
ggml_backend_buffer_type_t,
)
def ggml_backend_dev_buffer_type(device: ggml_backend_dev_t, /) -> ggml_backend_buffer_type_t:
...

# GGML_API const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft);
@ctypes_function(
"ggml_backend_buft_name",
[ggml_backend_buffer_type_t],
ctypes.c_char_p,
)
def ggml_backend_buft_name(buft: ggml_backend_buffer_type_t, /) -> bytes:
...


# struct llama_model_params {
@@ -703,7 +787,7 @@ class llama_model_params(ctypes.Structure):

if TYPE_CHECKING:
devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused
tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override]
n_gpu_layers: int
split_mode: int
main_gpu: int
@@ -718,7 +802,7 @@ class llama_model_params(ctypes.Structure):

_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unnused
("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused
("tensor_buft_overrides", ctypes.POINTER(llama_model_tensor_buft_override)),
("n_gpu_layers", ctypes.c_int32),
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
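A small sketch showing how the new bindings fit together to list the buffer type names that can appear on the right-hand side of an override_tensor entry (it mirrors the enumeration loop added to llama.py above):

import llama_cpp

for i in range(llama_cpp.ggml_backend_dev_count()):
    dev = llama_cpp.ggml_backend_dev_get(i)
    buft = llama_cpp.ggml_backend_dev_buffer_type(dev)
    if buft:
        # ggml_backend_buft_name returns a C string (bytes); decode it for display.
        print(llama_cpp.ggml_backend_buft_name(buft).decode("utf-8"))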