
Commit 70e191b

Introducing Llama Tokenizer (#7078)
* Introducing Llama Tokenizer
* Add more tests
* Dynamically detect the byte encoding offset to the Id.
* Address the feedback
1 parent cea9d90 commit 70e191b

15 files changed, +6746 -200 lines changed
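The pieces in this commit (SentencePieceBpe, LlamaNormalizer, SentencePiecePreTokenizer) add a SentencePiece BPE based Llama tokenizer to Microsoft.ML.Tokenizers. The public entry point is not visible in this excerpt, so the following is only a hypothetical usage sketch; the factory and encode helper names are assumptions, not taken from the diff.

using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML.Tokenizers;

// Hypothetical usage sketch: the factory and encode helper names below are assumptions,
// not shown in this excerpt; check the public API surface the PR actually adds.
using Stream modelStream = File.OpenRead("tokenizer.model"); // the Llama SentencePiece model file

Tokenizer tokenizer = Tokenizer.CreateLlama(modelStream);        // assumed factory name
IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, world!"); // assumed encode helper

Console.WriteLine(string.Join(", ", ids));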

THIRD-PARTY-NOTICES.TXT

+18
@@ -86,6 +86,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

License notice for SentencePiece
---------------------------------

https://github.com/google/sentencepiece/blob/master/LICENSE

Copyright 2016 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

License notice for BitUtility
------------------------------------------

src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj

+1
@@ -17,6 +17,7 @@
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Google.Protobuf" Version="$(GoogleProtobufVersion)" />
    <PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
  </ItemGroup>

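The new Google.Protobuf reference is what lets the library read a Llama tokenizer.model file, which is a serialized SentencePiece ModelProto (the proto definition lives in the sentencepiece repository linked in the notice above). Below is a hypothetical sketch of that parsing step, assuming the C# types protoc generates from sentencepiece_model.proto; the real loading code is inside the SentencePieceBpe model below and is not rendered in this excerpt.

using System;
using System.IO;
using Sentencepiece; // assumed namespace of the classes protoc generates from sentencepiece_model.proto

// Hypothetical sketch: the generated type and property names follow the proto definition
// (ModelProto with repeated SentencePiece pieces); the real parsing lives in SentencePieceBpe.
using Stream modelStream = File.OpenRead("tokenizer.model");
ModelProto modelProto = ModelProto.Parser.ParseFrom(modelStream);

int pieces = 0;
foreach (var piece in modelProto.Pieces)
{
    pieces++; // piece.Piece is the token text, piece.Score its score
}

Console.WriteLine(pieces);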
src/Microsoft.ML.Tokenizers/Model/SentencePieceBpe.cs

+906
Large diffs are not rendered by default.
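One bullet of the commit message says the tokenizer now dynamically detects the byte encoding offset to the Id. The SentencePieceBpe diff is not rendered here, but SentencePiece vocabularies conventionally expose byte-fallback pieces named <0x00> through <0xFF> in a contiguous id range, so a minimal sketch of such a detection could look like the following (the vocab dictionary and method are illustrative, not taken from the diff).

using System.Collections.Generic;

static class ByteFallback
{
    // Illustrative only: 'vocab' maps piece text to id, the way a SentencePiece vocabulary
    // is usually exposed; the real SentencePieceBpe implementation is not rendered above.
    public static int DetectByteCodeToIdOffset(IReadOnlyDictionary<string, int> vocab)
    {
        // Byte-fallback pieces are conventionally "<0x00>" .. "<0xFF>" and sit in a contiguous
        // id range, so the id of "<0x00>" is the offset from a raw byte value to its token id.
        return vocab.TryGetValue("<0x00>", out int zeroByteId) ? zeroByteId : -1;
    }
}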

src/Microsoft.ML.Tokenizers/Model/Tiktoken.cs

+32-186
Large diffs are not rendered by default.

src/Microsoft.ML.Tokenizers/Normalizer/LlamaNormalizer.cs

+128

@@ -0,0 +1,128 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Buffers;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Normalize the string according to SentencePiece normalization with the Llama model.
    /// </summary>
    public sealed class LlamaNormalizer : Normalizer
    {
        internal const char DummyPrefix = '\u2581'; // '▁' (LOWER ONE EIGHTH BLOCK)

        /// <summary>
        /// Creates a LlamaNormalizer object.
        /// </summary>
        public LlamaNormalizer(bool removeExtraWhiteSpaces, bool addDummyPrefix, bool escapeWhiteSpaces, bool treatWhitespaceAsSuffix)
        {
            RemoveExtraWhiteSpaces = removeExtraWhiteSpaces;
            AddDummyPrefix = addDummyPrefix;
            EscapeWhiteSpaces = escapeWhiteSpaces;
            TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
        }

        /// <summary>
        /// Indicate removing extra white spaces from the original string during the normalization.
        /// </summary>
        public bool RemoveExtraWhiteSpaces { get; }

        /// <summary>
        /// Indicate emitting the dummy prefix character U+2581 at the beginning of sentence token during the encoding.
        /// </summary>
        public bool AddDummyPrefix { get; }

        /// <summary>
        /// Indicate replacing white space characters with the dummy prefix character U+2581 during the normalization.
        /// </summary>
        public bool EscapeWhiteSpaces { get; }

        /// <summary>
        /// Indicate emitting the dummy prefix at the end of the sentence instead of the beginning.
        /// </summary>
        public bool TreatWhitespaceAsSuffix { get; }

        /// <summary>
        /// Normalize the original string according to SentencePiece normalization with Llama model.
        /// </summary>
        /// <param name="original">The original string to normalize.</param>
        /// <returns>The normalized string.</returns>
        public override string Normalize(string original)
        {
            if (string.IsNullOrEmpty(original))
            {
                return string.Empty;
            }

            int startIndex = 0;
            int endIndex = original.Length - 1;

            if (RemoveExtraWhiteSpaces)
            {
                while (startIndex <= endIndex && original[startIndex] == ' ')
                {
                    startIndex++;
                }

                while (endIndex >= startIndex && original[endIndex] == ' ')
                {
                    endIndex--;
                }

                if (startIndex > endIndex)
                {
                    // The input contained only white spaces.
                    return string.Empty;
                }
            }

            int length = endIndex - startIndex + 1;

            Span<char> span = stackalloc char[512];
            char[]? buffer = null;

            if (span.Length < length + 1)
            {
                // Rent a larger buffer, leaving room for the dummy prefix if needed.
                buffer = ArrayPool<char>.Shared.Rent(AddDummyPrefix ? length + 1 : length);
                span = buffer;
            }

            int bufferIndex = 0;
            if (AddDummyPrefix && !TreatWhitespaceAsSuffix)
            {
                span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : ' ';
            }

            while (startIndex <= endIndex)
            {
                char c = original[startIndex++];
                if (c == ' ')
                {
                    span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : c;

                    if (RemoveExtraWhiteSpaces)
                    {
                        // Collapse a run of spaces into a single (escaped) space.
                        while (startIndex <= endIndex && original[startIndex] == ' ')
                        {
                            startIndex++;
                        }
                    }
                }
                else
                {
                    span[bufferIndex++] = c;
                }
            }

            if (AddDummyPrefix && TreatWhitespaceAsSuffix)
            {
                span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : ' ';
            }

            string result = span.Slice(0, bufferIndex).ToString();

            if (buffer is not null)
            {
                ArrayPool<char>.Shared.Return(buffer);
            }

            return result;
        }
    }
}
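A short usage sketch of the normalizer above with the settings a Llama SentencePiece model typically uses (trim and collapse spaces, add the dummy prefix, escape spaces as U+2581, prefix rather than suffix); the expected output follows directly from the Normalize implementation shown.

using System;
using Microsoft.ML.Tokenizers;

var normalizer = new LlamaNormalizer(
    removeExtraWhiteSpaces: true,
    addDummyPrefix: true,
    escapeWhiteSpaces: true,
    treatWhitespaceAsSuffix: false);

// Leading/trailing spaces are trimmed, inner runs collapse to one escaped space,
// and the dummy prefix '▁' (U+2581) is prepended.
string normalized = normalizer.Normalize("  Hello   world  ");
Console.WriteLine(normalized); // ▁Hello▁world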

src/Microsoft.ML.Tokenizers/PreTokenizer/SentencePiecePreTokenizer.cs

+36

@@ -0,0 +1,36 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// The pre-tokenizer for SentencePiece tokenizers.
    /// </summary>
    internal sealed partial class SentencePiecePreTokenizer : PreTokenizer
    {
        /// <summary>
        /// Gets a singleton instance of the SentencePiece pre-tokenizer.
        /// </summary>
        public static SentencePiecePreTokenizer Instance { get; } = new SentencePiecePreTokenizer();

        /// <summary>
        /// Return the whole text as one chunk.
        /// </summary>
        /// <param name="text">The string to split into tokens.</param>
        /// <param name="considerSpecialTokens">Indicates whether to keep the special tokens.</param>
        /// <returns>The original string as one chunk.</returns>
        public override IEnumerable<Split> PreTokenize(string text, bool considerSpecialTokens = true)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            yield return new Split(text, (0, text.Length));
        }
    }
}
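Because the pre-tokenizer is internal, it is only reachable from inside the library or its test assembly. Its behavior follows directly from the implementation above: any non-empty input is returned as a single Split covering the whole string, as in this small sketch.

using System;
using Microsoft.ML.Tokenizers;

// Runs only inside the Microsoft.ML.Tokenizers assembly (or a friend/test assembly),
// because SentencePiecePreTokenizer is internal.
var pre = SentencePiecePreTokenizer.Instance;

int chunks = 0;
foreach (Split split in pre.PreTokenize("Hello world"))
{
    chunks++; // exactly one Split, spanning offsets (0, 11)
}

Console.WriteLine(chunks); // 1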
