From 8dc45dea8b47716194a6cd87f975fe11dcc65595 Mon Sep 17 00:00:00 2001 From: DeProgrammer Date: Thu, 1 May 2025 22:14:08 -0500 Subject: [PATCH 1/2] Support override-tensors parameter https://github.com/ggml-org/llama.cpp/pull/11397 --- LLama.Web/Common/ModelOptions.cs | 3 + LLama/Abstractions/IModelParams.cs | 6 + LLama/Abstractions/TensorBufferOverride.cs | 36 +++++ LLama/Common/ModelParams.cs | 3 + LLama/Extensions/IModelParamsExtensions.cs | 15 ++ LLama/Native/LLamaModelParams.cs | 6 + .../Native/LLamaModelTensorBufferOverride.cs | 22 +++ .../Native/LLamaTensorBufferOverrideHelper.cs | 135 ++++++++++++++++++ LLama/Native/NativeApi.Load.cs | 2 + LLama/Native/NativeApi.cs | 31 ++++ 10 files changed, 259 insertions(+) create mode 100644 LLama/Abstractions/TensorBufferOverride.cs create mode 100644 LLama/Native/LLamaModelTensorBufferOverride.cs create mode 100644 LLama/Native/LLamaTensorBufferOverrideHelper.cs diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index a67a11a96..9824c0922 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -26,6 +26,9 @@ public class ModelOptions /// public GPUSplitMode? SplitMode { get; set; } + /// + public List TensorBufferOverrides { get; set; } = new(); + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index cbbacafe5..8a752e190 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -38,6 +38,12 @@ public interface IModelParams /// GPUSplitMode? SplitMode { get; } + /// + /// Buffer type overrides for specific tensor patterns, allowing you to specify hardware devices to use for individual tensors or sets of tensors. + /// Equivalent to --override-tensor or -ot on the llama.cpp command line or tensor_buft_overrides internally. 
+ /// + List TensorBufferOverrides { get; } + /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) /// diff --git a/LLama/Abstractions/TensorBufferOverride.cs b/LLama/Abstractions/TensorBufferOverride.cs new file mode 100644 index 000000000..e8ec3f136 --- /dev/null +++ b/LLama/Abstractions/TensorBufferOverride.cs @@ -0,0 +1,36 @@ +using System; + +namespace LLama.Abstractions +{ + /// + /// Represents a mapping between a tensor name pattern and a specific buffer type + /// + public class TensorBufferOverride + { + /// + /// Pattern to match tensor names. This is a regular expression. You can check the tensor names via the model.Metadata. + /// + public string Pattern { get; set; } + + /// + /// Buffer type to use for matching tensors. Examples: CPU, GPU0, GPU1 + /// + public string BufferType { get; set; } + + /// + /// Creates a new tensor buffer override + /// + /// Pattern to match tensor names + /// Buffer type to use for matching tensors + public TensorBufferOverride(string pattern, string bufferType) + { + if (string.IsNullOrEmpty(pattern)) + throw new ArgumentException("Pattern cannot be null or empty", nameof(pattern)); + if (string.IsNullOrEmpty(bufferType)) + throw new ArgumentException("Buffer type cannot be null or empty", nameof(bufferType)); + + Pattern = pattern; + BufferType = bufferType; + } + } +} diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index 7e4b1a967..23f5681be 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -21,6 +21,9 @@ public record ModelParams /// public GPUSplitMode? 
SplitMode { get; set; } + /// + public List TensorBufferOverrides { get; set; } = new(); + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index 588564e33..8b71abff2 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -11,6 +11,8 @@ namespace LLama.Extensions; /// public static class IModelParamsExtensions { + private static LLamaTensorBufferOverrideHelper bufferOverrideHelper = new(); + /// /// Convert the given `IModelParams` into a `LLamaModelParams` /// @@ -45,6 +47,19 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result.tensor_split = (float*)disposer.Add(@params.TensorSplits.Pin()).Pointer; } + // Add tensor buffer overrides, if any + if (@params.TensorBufferOverrides.Count > 0) + { + disposer.Add(bufferOverrideHelper); + + foreach (var tensorOverride in @params.TensorBufferOverrides) + { + bufferOverrideHelper.AddOverride(tensorOverride.Pattern, tensorOverride.BufferType); + } + + bufferOverrideHelper.ApplyToModelParams(ref result); + } + if (@params.MetadataOverrides.Count == 0) { unsafe diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index 5159226fd..5a52d3c5c 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -13,6 +13,12 @@ public unsafe struct LLamaModelParams /// todo: add support for llama_model_params.devices /// private IntPtr devices; + + /// + /// NULL-terminated list of buffer types to use for tensors that match a pattern + /// actual type: llama_model_tensor_buft_override* + /// + public IntPtr tensor_buft_overrides; /// /// // number of layers to store in VRAM diff --git a/LLama/Native/LLamaModelTensorBufferOverride.cs b/LLama/Native/LLamaModelTensorBufferOverride.cs new file mode 100644 index 000000000..c3a6eac6f --- /dev/null +++ b/LLama/Native/LLamaModelTensorBufferOverride.cs @@ 
-0,0 +1,23 @@ +using System; +using System.Runtime.InteropServices; + +namespace LLama.Native + { + /// + /// Represents a mapping between a tensor name pattern and a backend buffer type
+ /// Original type: llama_model_tensor_buft_override + ///
+ [StructLayout(LayoutKind.Sequential)] + public struct LLamaModelTensorBufferOverride + { + /// + /// Tensor name pattern to match + /// + public IntPtr Pattern; + + /// + /// Backend buffer type to use for matching tensors, as obtained via ggml_backend_dev_buffer_type + /// + public IntPtr BufferType; + } +} diff --git a/LLama/Native/LLamaTensorBufferOverrideHelper.cs b/LLama/Native/LLamaTensorBufferOverrideHelper.cs new file mode 100644 index 000000000..8223cc013 --- /dev/null +++ b/LLama/Native/LLamaTensorBufferOverrideHelper.cs @@ -0,0 +1,135 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace LLama.Native +{ + /// + /// Helper for creating and managing tensor buffer overrides + /// + public class LLamaTensorBufferOverrideHelper : IDisposable + { + private readonly List _allocatedMemory = new(); + private readonly List _overrides = new(); + private IntPtr _overrideArray = IntPtr.Zero; + private readonly Dictionary _bufferTypeCache = new(); + + /// + /// Get all available buffer types + /// + /// Dictionary mapping buffer type names to their handles + public Dictionary GetAvailableBufferTypes() + { + var result = new Dictionary(); + + nuint count = NativeApi.ggml_backend_dev_count(); + for (nuint i = 0; i < count; i++) + { + IntPtr dev = NativeApi.ggml_backend_dev_get(i); + IntPtr buft = NativeApi.ggml_backend_dev_buffer_type(dev); + + if (buft != IntPtr.Zero) + { + IntPtr namePtr = NativeApi.ggml_backend_buft_name(buft); + string name = Marshal.PtrToStringAnsi(namePtr) ?? 
string.Empty; + + if (!string.IsNullOrEmpty(name)) + { + result[name] = buft; + _bufferTypeCache[name] = buft; + } + } + } + + return result; + } + + /// + /// Add a tensor buffer override + /// + /// Tensor name pattern to match + /// Name of the buffer type to use + /// True if the override was added successfully + public bool AddOverride(string pattern, string bufferTypeName) + { + if (string.IsNullOrEmpty(pattern) || string.IsNullOrEmpty(bufferTypeName)) + return false; + + // Get all buffer types if cache is empty + if (_bufferTypeCache.Count == 0) + { + GetAvailableBufferTypes(); + } + + // Check if we have this buffer type + if (!_bufferTypeCache.TryGetValue(bufferTypeName, out IntPtr bufferType)) + return false; + + // Allocate memory for the pattern string and keep track of it + byte[] patternBytes = Encoding.UTF8.GetBytes(pattern + "\0"); + IntPtr patternPtr = Marshal.AllocHGlobal(patternBytes.Length); + Marshal.Copy(patternBytes, 0, patternPtr, patternBytes.Length); + _allocatedMemory.Add(patternPtr); + + // Create the override + var @override = new LLamaModelTensorBufferOverride + { + Pattern = patternPtr, + BufferType = bufferType + }; + + _overrides.Add(@override); + return true; + } + + /// + /// Apply the overrides to model parameters + /// + /// Model parameters to update + public unsafe void ApplyToModelParams(ref LLamaModelParams modelParams) + { + if (_overrides.Count == 0) + { + modelParams.tensor_buft_overrides = IntPtr.Zero; + return; + } + + // Free previous array if it exists + if (_overrideArray != IntPtr.Zero) + { + Marshal.FreeHGlobal(_overrideArray); + } + + // Allocate memory for the array + null terminator + int size = Marshal.SizeOf() * (_overrides.Count + 1); + _overrideArray = Marshal.AllocHGlobal(size); + _allocatedMemory.Add(_overrideArray); + + // Copy overrides to array + for (int i = 0; i < _overrides.Count; i++) + { + IntPtr elemPtr = IntPtr.Add(_overrideArray, i * Marshal.SizeOf()); + Marshal.StructureToPtr(_overrides[i], 
elemPtr, false); + } + + // Add null terminator + IntPtr nullTermPtr = IntPtr.Add(_overrideArray, _overrides.Count * Marshal.SizeOf()); + Marshal.StructureToPtr(new LLamaModelTensorBufferOverride { Pattern = IntPtr.Zero, BufferType = IntPtr.Zero }, nullTermPtr, false); + + // Update model params + modelParams.tensor_buft_overrides = _overrideArray; + } + + /// + public void Dispose() + { + foreach (IntPtr ptr in _allocatedMemory) + { + Marshal.FreeHGlobal(ptr); + } + _allocatedMemory.Clear(); + _overrides.Clear(); + _overrideArray = IntPtr.Zero; + } + } +} diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs index 5ad30d032..ffe3233d5 100644 --- a/LLama/Native/NativeApi.Load.cs +++ b/LLama/Native/NativeApi.Load.cs @@ -101,6 +101,8 @@ private static void SetDllImportResolver() internal const string libraryName = "llama"; internal const string llavaLibraryName = "llava_shared"; + internal const string ggmlLibraryName = "ggml"; + internal const string ggmlBaseLibraryName = "ggml-base"; private static INativeLibrary? _loadedLLamaLibrary = null; private static INativeLibrary? _loadedLLavaLibrary = null; diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 4c788b7a0..4bd32ef09 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -439,5 +439,36 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback) // it would expose the raw pointer to the model, without properly wrapping it in a SafeLLamaModelHandle. 
//[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] //public static void llama_model* llama_get_model(SafeLLamaContextHandle ctx); + + /// + /// Get the number of available backend devices + /// + /// Count of available backend devices + [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern nuint ggml_backend_dev_count(); + + /// + /// Get a backend device by index + /// + /// Device index + /// Pointer to the backend device + [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_dev_get(nuint i); + + /// + /// Get the buffer type for a backend device + /// + /// Backend device pointer + /// Pointer to the buffer type + [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_dev_buffer_type(IntPtr dev); + + /// + /// Get the name of a buffer type + /// + /// Buffer type pointer + /// Name of the buffer type + [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_buft_name(IntPtr buft); } } From 1ac27e2255c55a7b50796abfdc88517fb43fbc6b Mon Sep 17 00:00:00 2001 From: DeProgrammer Date: Fri, 2 May 2025 17:59:45 -0500 Subject: [PATCH 2/2] Suppress 'ot' in _typos.toml --- .github/_typos.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/_typos.toml b/.github/_typos.toml index fb576b499..d1f4f98b1 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -17,4 +17,6 @@ extend-exclude = [ [default.extend-words] # Used in a comment in SafeLLamaSamplerHandle.cs, as a prefix of "hello" -teh = "hel" \ No newline at end of file +teh = "hel" +# ot is the shorthand version of llama.cpp's override-tensor parameter +ot = "ot"