
Commit 70e191b

Introducing Llama Tokenizer (#7078)
* Introducing Llama Tokenizer
* Add more tests
* Dynamically detect the byte encoding offset to the Id.
* Address the feedback
1 parent cea9d90 commit 70e191b

15 files changed, +6746 -200 lines changed
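The pieces in this commit (SentencePieceBpe, LlamaNormalizer, SentencePiecePreTokenizer) add a SentencePiece BPE based Llama tokenizer to Microsoft.ML.Tokenizers. The public entry point is not visible in this excerpt, so the following is only a hypothetical usage sketch; the factory and encode helper names are assumptions, not taken from the diff.

using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML.Tokenizers;

// Hypothetical usage sketch: the factory and encode helper names below are assumptions,
// not shown in this excerpt; check the public API surface the PR actually adds.
using Stream modelStream = File.OpenRead("tokenizer.model"); // the Llama SentencePiece model file

Tokenizer tokenizer = Tokenizer.CreateLlama(modelStream);        // assumed factory name
IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, world!"); // assumed encode helper

Console.WriteLine(string.Join(", ", ids));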

THIRD-PARTY-NOTICES.TXT

+18
@@ -86,6 +86,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

License notice for SentencePiece
---------------------------------

https://github.com/google/sentencepiece/blob/master/LICENSE

Copyright 2016 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

License notice for BitUtility
------------------------------------------

src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj

+1
@@ -17,6 +17,7 @@
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Google.Protobuf" Version="$(GoogleProtobufVersion)" />
    <PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
  </ItemGroup>

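The new Google.Protobuf reference is what lets the library read a Llama tokenizer.model file, which is a serialized SentencePiece ModelProto (the proto definition lives in the sentencepiece repository linked in the notice above). Below is a hypothetical sketch of that parsing step, assuming the C# types protoc generates from sentencepiece_model.proto; the real loading code is inside the SentencePieceBpe model below and is not rendered in this excerpt.

using System;
using System.IO;
using Sentencepiece; // assumed namespace of the classes protoc generates from sentencepiece_model.proto

// Hypothetical sketch: the generated type and property names follow the proto definition
// (ModelProto with repeated SentencePiece pieces); the real parsing lives in SentencePieceBpe.
using Stream modelStream = File.OpenRead("tokenizer.model");
ModelProto modelProto = ModelProto.Parser.ParseFrom(modelStream);

int pieces = 0;
foreach (var piece in modelProto.Pieces)
{
    pieces++; // piece.Piece is the token text, piece.Score its score
}

Console.WriteLine(pieces);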
src/Microsoft.ML.Tokenizers/Model/SentencePieceBpe.cs

+906
Large diffs are not rendered by default.
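One bullet of the commit message says the tokenizer now dynamically detects the byte encoding offset to the Id. The SentencePieceBpe diff is not rendered here, but SentencePiece vocabularies conventionally expose byte-fallback pieces named <0x00> through <0xFF> in a contiguous id range, so a minimal sketch of such a detection could look like the following (the vocab dictionary and method are illustrative, not taken from the diff).

using System.Collections.Generic;

static class ByteFallback
{
    // Illustrative only: 'vocab' maps piece text to id, the way a SentencePiece vocabulary
    // is usually exposed; the real SentencePieceBpe implementation is not rendered above.
    public static int DetectByteCodeToIdOffset(IReadOnlyDictionary<string, int> vocab)
    {
        // Byte-fallback pieces are conventionally "<0x00>" .. "<0xFF>" and sit in a contiguous
        // id range, so the id of "<0x00>" is the offset from a raw byte value to its token id.
        return vocab.TryGetValue("<0x00>", out int zeroByteId) ? zeroByteId : -1;
    }
}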

src/Microsoft.ML.Tokenizers/Model/Tiktoken.cs

+32-186
Large diffs are not rendered by default.

src/Microsoft.ML.Tokenizers/Normalizer/LlamaNormalizer.cs

+128

@@ -0,0 +1,128 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Buffers;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Normalize the string according to SentencePiece normalization with the Llama model.
    /// </summary>
    public sealed class LlamaNormalizer : Normalizer
    {
        internal const char DummyPrefix = '\u2581'; // '▁' (LOWER ONE EIGHTH BLOCK)

        /// <summary>
        /// Creates a LlamaNormalizer object.
        /// </summary>
        public LlamaNormalizer(bool removeExtraWhiteSpaces, bool addDummyPrefix, bool escapeWhiteSpaces, bool treatWhitespaceAsSuffix)
        {
            RemoveExtraWhiteSpaces = removeExtraWhiteSpaces;
            AddDummyPrefix = addDummyPrefix;
            EscapeWhiteSpaces = escapeWhiteSpaces;
            TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
        }

        /// <summary>
        /// Indicate removing extra white spaces from the original string during the normalization.
        /// </summary>
        public bool RemoveExtraWhiteSpaces { get; }

        /// <summary>
        /// Indicate emitting the dummy prefix character U+2581 at the beginning of sentence token during the encoding.
        /// </summary>
        public bool AddDummyPrefix { get; }

        /// <summary>
        /// Indicate replacing white space characters with the dummy prefix character U+2581 during the normalization.
        /// </summary>
        public bool EscapeWhiteSpaces { get; }

        /// <summary>
        /// Indicate emitting the dummy prefix at the end of the sentence instead of the beginning.
        /// </summary>
        public bool TreatWhitespaceAsSuffix { get; }

        /// <summary>
        /// Normalize the original string according to SentencePiece normalization with Llama model.
        /// </summary>
        /// <param name="original">The original string to normalize.</param>
        /// <returns>The normalized string.</returns>
        public override string Normalize(string original)
        {
            if (string.IsNullOrEmpty(original))
            {
                return string.Empty;
            }

            int startIndex = 0;
            int endIndex = original.Length - 1;

            if (RemoveExtraWhiteSpaces)
            {
                while (startIndex <= endIndex && original[startIndex] == ' ')
                {
                    startIndex++;
                }

                while (endIndex >= startIndex && original[endIndex] == ' ')
                {
                    endIndex--;
                }

                if (startIndex > endIndex)
                {
                    // The input contained only white spaces.
                    return string.Empty;
                }
            }

            int length = endIndex - startIndex + 1;

            Span<char> span = stackalloc char[512];
            char[]? buffer = null;

            if (span.Length < length + 1)
            {
                // Rent a larger buffer, leaving room for the dummy prefix if needed.
                buffer = ArrayPool<char>.Shared.Rent(AddDummyPrefix ? length + 1 : length);
                span = buffer;
            }

            int bufferIndex = 0;
            if (AddDummyPrefix && !TreatWhitespaceAsSuffix)
            {
                span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : ' ';
            }

            while (startIndex <= endIndex)
            {
                char c = original[startIndex++];
                if (c == ' ')
                {
                    span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : c;

                    if (RemoveExtraWhiteSpaces)
                    {
                        // Collapse a run of spaces into a single (escaped) space.
                        while (startIndex <= endIndex && original[startIndex] == ' ')
                        {
                            startIndex++;
                        }
                    }
                }
                else
                {
                    span[bufferIndex++] = c;
                }
            }

            if (AddDummyPrefix && TreatWhitespaceAsSuffix)
            {
                span[bufferIndex++] = EscapeWhiteSpaces ? DummyPrefix : ' ';
            }

            string result = span.Slice(0, bufferIndex).ToString();

            if (buffer is not null)
            {
                ArrayPool<char>.Shared.Return(buffer);
            }

            return result;
        }
    }
}
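A short usage sketch of the normalizer above with the settings a Llama SentencePiece model typically uses (trim and collapse spaces, add the dummy prefix, escape spaces as U+2581, prefix rather than suffix); the expected output follows directly from the Normalize implementation shown.

using System;
using Microsoft.ML.Tokenizers;

var normalizer = new LlamaNormalizer(
    removeExtraWhiteSpaces: true,
    addDummyPrefix: true,
    escapeWhiteSpaces: true,
    treatWhitespaceAsSuffix: false);

// Leading/trailing spaces are trimmed, inner runs collapse to one escaped space,
// and the dummy prefix '▁' (U+2581) is prepended.
string normalized = normalizer.Normalize("  Hello   world  ");
Console.WriteLine(normalized); // ▁Hello▁world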

src/Microsoft.ML.Tokenizers/PreTokenizer/SentencePiecePreTokenizer.cs

+36

@@ -0,0 +1,36 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// The pre-tokenizer for SentencePiece tokenizers.
    /// </summary>
    internal sealed partial class SentencePiecePreTokenizer : PreTokenizer
    {
        /// <summary>
        /// Gets a singleton instance of the SentencePiece pre-tokenizer.
        /// </summary>
        public static SentencePiecePreTokenizer Instance { get; } = new SentencePiecePreTokenizer();

        /// <summary>
        /// Return the whole text as one chunk.
        /// </summary>
        /// <param name="text">The string to split into tokens.</param>
        /// <param name="considerSpecialTokens">Indicates whether to keep the special tokens.</param>
        /// <returns>The original string as one chunk.</returns>
        public override IEnumerable<Split> PreTokenize(string text, bool considerSpecialTokens = true)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            yield return new Split(text, (0, text.Length));
        }
    }
}
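Because the pre-tokenizer is internal, it is only reachable from inside the library or its test assembly. Its behavior follows directly from the implementation above: any non-empty input is returned as a single Split covering the whole string, as in this small sketch.

using System;
using Microsoft.ML.Tokenizers;

// Runs only inside the Microsoft.ML.Tokenizers assembly (or a friend/test assembly),
// because SentencePiecePreTokenizer is internal.
var pre = SentencePiecePreTokenizer.Instance;

int chunks = 0;
foreach (Split split in pre.PreTokenize("Hello world"))
{
    chunks++; // exactly one Split, spanning offsets (0, 11)
}

Console.WriteLine(chunks); // 1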
