Support override-tensors parameter

dpmm99 · dpmm99 · commit 3dd6110808b1 · 2025-05-01T22:14:08.000-05:00
ggml-org/llama.cpp#11397
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
@@ -26,6 +26,9 @@ public class ModelOptions
         /// <inheritdoc />
         public GPUSplitMode? SplitMode { get; set; }
 
+        /// <inheritdoc />
+        public List<TensorBufferOverride> TensorBufferOverrides { get; set; } = new();
+
         /// <inheritdoc />
         public int GpuLayerCount { get; set; } = 20;
 
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
@@ -38,6 +38,12 @@ public interface IModelParams
         /// </summary>
         GPUSplitMode? SplitMode { get; }
 
+        /// <summary>
+        /// Buffer type overrides for specific tensor patterns, allowing you to specify hardware devices to use for individual tensors or sets of tensors.
+        /// Equivalent to --override-tensor or -ot on the llama.cpp command line or tensor_buft_overrides internally.
+        /// </summary>
+        List<TensorBufferOverride> TensorBufferOverrides { get; }
+
         /// <summary>
         /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
         /// </summary>
diff --git a/LLama/Abstractions/TensorBufferOverride.cs b/LLama/Abstractions/TensorBufferOverride.cs
@@ -0,0 +1,36 @@
+using System;
+
+namespace LLama.Abstractions
+{
+    /// <summary>
+    /// Pairs a tensor name pattern with the name of the buffer type that matching tensors should use
+    /// </summary>
+    public class TensorBufferOverride
+    {
+        /// <summary>
+        /// Regular expression matched against tensor names. The tensor names can be checked via the model.Metadata.
+        /// </summary>
+        public string Pattern { get; set; }
+
+        /// <summary>
+        /// Name of the buffer type to use for tensors that match. Examples: CPU, GPU0, GPU1
+        /// </summary>
+        public string BufferType { get; set; }
+
+        /// <summary>
+        /// Creates a new tensor buffer override
+        /// </summary>
+        /// <param name="pattern">Pattern to match tensor names</param>
+        /// <param name="bufferType">Buffer type to use for matching tensors</param>
+        /// <exception cref="ArgumentException">Either argument is null or empty</exception>
+        public TensorBufferOverride(string pattern, string bufferType)
+        {
+            Pattern = string.IsNullOrEmpty(pattern)
+                ? throw new ArgumentException("Pattern cannot be null or empty", nameof(pattern))
+                : pattern;
+            BufferType = string.IsNullOrEmpty(bufferType)
+                ? throw new ArgumentException("Buffer type cannot be null or empty", nameof(bufferType))
+                : bufferType;
+        }
+    }
+}
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
@@ -21,6 +21,9 @@ public record ModelParams
         /// <inheritdoc />
         public GPUSplitMode? SplitMode { get; set; }
 
+        /// <inheritdoc />
+        public List<TensorBufferOverride> TensorBufferOverrides { get; set; } = new();
+
         /// <inheritdoc />
         public int GpuLayerCount { get; set; } = 20;
 
diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs
@@ -11,6 +11,8 @@ namespace LLama.Extensions;
 /// </summary>
 public static class IModelParamsExtensions
 {
+    private static LLamaTensorBufferOverrideHelper bufferOverrideHelper = new();
+
     /// <summary>
     /// Convert the given `IModelParams` into a `LLamaModelParams`
     /// </summary>
@@ -45,6 +47,19 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
             result.tensor_split = (float*)disposer.Add(@params.TensorSplits.Pin()).Pointer;
         }
 
+        // Add tensor buffer overrides, if any. A helper is created per call instead of using the shared static field:
+        // a shared instance would have its overrides mixed and its native memory freed while another caller still uses it.
+        if (@params.TensorBufferOverrides.Count > 0)
+        {
+            var overrideHelper = new LLamaTensorBufferOverrideHelper();
+            disposer.Add(overrideHelper);
+            foreach (var tensorOverride in @params.TensorBufferOverrides)
+            {
+                overrideHelper.AddOverride(tensorOverride.Pattern, tensorOverride.BufferType);
+            }
+            overrideHelper.ApplyToModelParams(ref result);
+        }
+
         if (@params.MetadataOverrides.Count == 0)
         {
             unsafe
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
@@ -12,12 +12,13 @@ public unsafe struct LLamaModelParams
         /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
         /// todo: add support for llama_model_params.devices
         /// </summary>
-        private IntPtr devices;
+        private IntPtr devices;
 
-        // NULL-terminated list of buffer types to use for tensors that match a pattern
-        // actual type: llama_model_tensor_buft_override* 
-        // todo: add support for tensor_buft_overrides
-        private IntPtr tensor_buft_overrides;
+        /// <summary>
+        /// NULL-terminated list of buffer types to use for tensors that match a pattern
+        /// actual type: llama_model_tensor_buft_override*
+        /// </summary>
+        public IntPtr tensor_buft_overrides;
 
         /// <summary>
         /// // number of layers to store in VRAM
@@ -111,6 +112,6 @@ public static LLamaModelParams Default()
 
             [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
             static extern LLamaModelParams llama_model_default_params();
-        }
+        }
     }
 }
diff --git a/LLama/Native/LLamaModelTensorBufferOverride.cs b/LLama/Native/LLamaModelTensorBufferOverride.cs
@@ -0,0 +1,22 @@
+using System;
+
+namespace LLama.Native
+{
+    /// <summary>
+    /// One entry of the native tensor buffer override list: a tensor name pattern and the backend buffer type to use for it<br/>
+    /// Original type: llama_model_tensor_buft_override. LLamaTensorBufferOverrideHelper ends the list with an all-zero entry.
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct LLamaModelTensorBufferOverride
+    {
+        /// <summary>
+        /// Pointer to the tensor name pattern to match (LLamaTensorBufferOverrideHelper stores a null-terminated UTF-8 string)
+        /// </summary>
+        public IntPtr Pattern;
+
+        /// <summary>
+        /// Backend buffer type handle to use for matching tensors, as obtained via ggml_backend_dev_buffer_type
+        /// </summary>
+        public IntPtr BufferType;
+    }
+}
diff --git a/LLama/Native/LLamaTensorBufferOverrideHelper.cs b/LLama/Native/LLamaTensorBufferOverrideHelper.cs
@@ -0,0 +1,135 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace LLama.Native
+{
+    /// <summary>
+    /// Builds the null-terminated native array of tensor buffer overrides and owns the unmanaged
+    /// memory behind it until <see cref="Dispose"/> is called. No synchronization is performed.
+    /// </summary>
+    public class LLamaTensorBufferOverrideHelper : IDisposable
+    {
+        // Unmanaged copies of the pattern strings; released in Dispose
+        private readonly List<IntPtr> _allocatedMemory = new();
+        private readonly List<LLamaModelTensorBufferOverride> _overrides = new();
+        // Most recently built native array; tracked on its own so that it is freed exactly once
+        private IntPtr _overrideArray = IntPtr.Zero;
+        private readonly Dictionary<string, IntPtr> _bufferTypeCache = new();
+
+        /// <summary>
+        /// Enumerate the backend devices and collect the buffer type of each one, keyed by its name.
+        /// Every entry found is also stored in the cache consulted by <see cref="AddOverride"/>.
+        /// </summary>
+        /// <returns>Dictionary mapping buffer type names to their handles</returns>
+        public Dictionary<string, IntPtr> GetAvailableBufferTypes()
+        {
+            var result = new Dictionary<string, IntPtr>();
+
+            nuint count = NativeApi.ggml_backend_dev_count();
+            for (nuint i = 0; i < count; i++)
+            {
+                IntPtr buft = NativeApi.ggml_backend_dev_buffer_type(NativeApi.ggml_backend_dev_get(i));
+                if (buft == IntPtr.Zero)
+                    continue;
+
+                string name = Marshal.PtrToStringAnsi(NativeApi.ggml_backend_buft_name(buft)) ?? string.Empty;
+                if (string.IsNullOrEmpty(name))
+                    continue;
+
+                // Entries sharing a name overwrite each other; the last device enumerated wins
+                result[name] = buft;
+                _bufferTypeCache[name] = buft;
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Queue an override that places every tensor whose name matches <paramref name="pattern"/>
+        /// in the buffer type called <paramref name="bufferTypeName"/>.
+        /// </summary>
+        /// <param name="pattern">Tensor name pattern to match</param>
+        /// <param name="bufferTypeName">Name of the buffer type to use</param>
+        /// <returns>False if an argument is null/empty or the buffer type is unknown, otherwise true</returns>
+        public bool AddOverride(string pattern, string bufferTypeName)
+        {
+            if (string.IsNullOrEmpty(pattern) || string.IsNullOrEmpty(bufferTypeName))
+                return false;
+
+            // Get all buffer types if cache is empty
+            if (_bufferTypeCache.Count == 0)
+                GetAvailableBufferTypes();
+
+            // Check if we have this buffer type
+            if (!_bufferTypeCache.TryGetValue(bufferTypeName, out IntPtr bufferType))
+                return false;
+
+            // Copy the pattern into unmanaged memory as a null-terminated UTF-8 string and keep track of it
+            byte[] patternBytes = Encoding.UTF8.GetBytes(pattern + "\0");
+            IntPtr patternPtr = Marshal.AllocHGlobal(patternBytes.Length);
+            _allocatedMemory.Add(patternPtr);
+            Marshal.Copy(patternBytes, 0, patternPtr, patternBytes.Length);
+
+            _overrides.Add(new LLamaModelTensorBufferOverride { Pattern = patternPtr, BufferType = bufferType });
+            return true;
+        }
+
+        /// <summary>
+        /// Write the queued overrides into a freshly allocated native array and point
+        /// <paramref name="modelParams"/> at it. The array built by an earlier call is freed first,
+        /// so parameters populated by that earlier call must no longer be used.
+        /// </summary>
+        /// <param name="modelParams">Model parameters to update</param>
+        public unsafe void ApplyToModelParams(ref LLamaModelParams modelParams)
+        {
+            if (_overrides.Count == 0)
+            {
+                modelParams.tensor_buft_overrides = IntPtr.Zero;
+                return;
+            }
+
+            // Free previous array if it exists. It is not in _allocatedMemory, so Dispose cannot free it again.
+            FreeOverrideArray();
+
+            // Allocate memory for the array + null terminator
+            int elemSize = Marshal.SizeOf<LLamaModelTensorBufferOverride>();
+            _overrideArray = Marshal.AllocHGlobal(elemSize * (_overrides.Count + 1));
+
+            // Copy overrides to array
+            for (int i = 0; i < _overrides.Count; i++)
+            {
+                Marshal.StructureToPtr(_overrides[i], IntPtr.Add(_overrideArray, i * elemSize), false);
+            }
+
+            // Add null terminator: an entry whose fields are both zero
+            IntPtr nullTermPtr = IntPtr.Add(_overrideArray, _overrides.Count * elemSize);
+            Marshal.StructureToPtr(default(LLamaModelTensorBufferOverride), nullTermPtr, false);
+
+            // Update model params
+            modelParams.tensor_buft_overrides = _overrideArray;
+        }
+
+        /// <inheritdoc />
+        public void Dispose()
+        {
+            foreach (IntPtr ptr in _allocatedMemory)
+            {
+                Marshal.FreeHGlobal(ptr);
+            }
+            _allocatedMemory.Clear();
+            _overrides.Clear();
+            FreeOverrideArray();
+        }
+
+        // Frees the native override array, if any, and clears the field so it can never be freed twice
+        private void FreeOverrideArray()
+        {
+            if (_overrideArray == IntPtr.Zero)
+                return;
+
+            Marshal.FreeHGlobal(_overrideArray);
+            _overrideArray = IntPtr.Zero;
+        }
+    }
+}
diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs
@@ -107,6 +107,8 @@ private static void SetDllImportResolver()
 
         internal const string libraryName = "llama";
         internal const string llavaLibraryName = "llava_shared";
+        internal const string ggmlLibraryName = "ggml";
+        internal const string ggmlBaseLibraryName = "ggml-base";
 
         private static INativeLibrary? _loadedLLamaLibrary = null;
         private static INativeLibrary? _loadedLLavaLibrary = null;
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
@@ -439,5 +439,36 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         // it would expose the raw pointer to the model, without properly wrapping it in a SafeLLamaModelHandle.
         //[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         //public static void llama_model* llama_get_model(SafeLLamaContextHandle ctx);
+
+        /// <summary>
+        /// Get the number of available backend devices (imported from the "ggml" library)
+        /// </summary>
+        /// <returns>Count of available backend devices</returns>
+        [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern nuint ggml_backend_dev_count();
+
+        /// <summary>
+        /// Get a backend device by index (imported from the "ggml" library)
+        /// </summary>
+        /// <param name="i">Device index (enumerated from 0 up to, but excluding, <see cref="ggml_backend_dev_count"/>)</param>
+        /// <returns>Pointer to the backend device</returns>
+        [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern IntPtr ggml_backend_dev_get(nuint i);
+
+        /// <summary>
+        /// Get the buffer type for a backend device (imported from the "ggml-base" library)
+        /// </summary>
+        /// <param name="dev">Backend device pointer, as returned by <see cref="ggml_backend_dev_get"/></param>
+        /// <returns>Pointer to the buffer type; LLamaTensorBufferOverrideHelper skips devices for which this is IntPtr.Zero</returns>
+        [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern IntPtr ggml_backend_dev_buffer_type(IntPtr dev);
+
+        /// <summary>
+        /// Get the name of a buffer type (imported from the "ggml-base" library)
+        /// </summary>
+        /// <param name="buft">Buffer type pointer, as returned by <see cref="ggml_backend_dev_buffer_type"/></param>
+        /// <returns>Pointer to the name as a native string; LLamaTensorBufferOverrideHelper decodes it with Marshal.PtrToStringAnsi</returns>
+        [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern IntPtr ggml_backend_buft_name(IntPtr buft);
     }
 }