Create SentencePieceTokenizer from options object (#7403) · dotnet/machinelearning@0807bd8 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0807bd8

Browse files
authored
Create SentencePieceTokenizer from options object (#7403)
* Create SentencePieceTokenizer from options object * Address the feedback
1 parent 2faac1e commit 0807bd8

File tree

7 files changed

+643
-31
lines changed

7 files changed

+643
-31
lines changed

eng/Versions.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
<MicrosoftMLTensorFlowTestModelsVersion>0.0.13-test</MicrosoftMLTensorFlowTestModelsVersion>
101101
<MicrosoftMLTestDatabasesVersion>0.0.6-test</MicrosoftMLTestDatabasesVersion>
102102
<MicrosoftMLTestModelsVersion>0.0.7-test</MicrosoftMLTestModelsVersion>
103-
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25110.1</MicrosoftMLTestTokenizersVersion>
103+
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25126.1</MicrosoftMLTestTokenizersVersion>
104104
<SystemDataSqlClientVersion>4.9.0</SystemDataSqlClientVersion>
105105
<SystemDataSQLiteCoreVersion>1.0.118</SystemDataSQLiteCoreVersion>
106106
<XunitCombinatorialVersion>1.6.24</XunitCombinatorialVersion>

src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,67 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
5959
specialTokens);
6060
}
6161

62+
internal SentencePieceBaseModel(SentencePieceOptions options)
63+
{
64+
if (options is null)
65+
{
66+
throw new ArgumentNullException(nameof(options));
67+
}
68+
69+
if (options.Vocabulary is null)
70+
{
71+
throw new ArgumentNullException(nameof(options.Vocabulary));
72+
}
73+
74+
if (options.BeginningOfSentenceToken is null)
75+
{
76+
throw new ArgumentNullException(nameof(options.BeginningOfSentenceToken));
77+
}
78+
79+
if (options.EndOfSentenceToken is null)
80+
{
81+
throw new ArgumentNullException(nameof(options.EndOfSentenceToken));
82+
}
83+
84+
if (options.UnknownToken is null)
85+
{
86+
throw new ArgumentNullException(nameof(options.UnknownToken));
87+
}
88+
89+
AddBeginningOfSentence = options.AddBeginningOfSentence;
90+
AddEndOfSentence = options.AddEndOfSentence;
91+
BeginningOfSentenceToken = options.BeginningOfSentenceToken;
92+
EndOfSentenceToken = options.EndOfSentenceToken;
93+
UnknownToken = options.UnknownToken;
94+
AddDummyPrefix = options.AddDummyPrefix;
95+
EscapeWhiteSpaces = options.EscapeWhiteSpaces;
96+
TreatWhitespaceAsSuffix = options.TreatWhitespaceAsSuffix;
97+
ByteFallback = options.ByteFallback;
98+
SpecialTokens = options.SpecialTokens;
99+
100+
if (SpecialTokens is not null && SpecialTokens.Count > 0)
101+
{
102+
InternalSpecialTokens = new Dictionary<StringSpanOrdinalKey, int>();
103+
SpecialTokensReverse = new Dictionary<int, string>();
104+
105+
foreach (var item in SpecialTokens)
106+
{
107+
InternalSpecialTokens.Add(new StringSpanOrdinalKey(item.Key), item.Value);
108+
SpecialTokensReverse.Add(item.Value, item.Key);
109+
}
110+
111+
// We create this Regex object without a timeout, as we expect the match operation to complete in O(N) time complexity. Note that `specialTokens` are treated as constants after the tokenizer is created.
112+
SpecialTokensRegex = new Regex(string.Join("|", SpecialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
113+
}
114+
115+
Normalizer = new SentencePieceNormalizer(
116+
options.PrecompiledNormalizationData,
117+
options.RemoveExtraWhiteSpaces,
118+
options.AddDummyPrefix, options.EscapeWhiteSpaces,
119+
options.TreatWhitespaceAsSuffix,
120+
SpecialTokens);
121+
}
122+
62123
internal Regex? SpecialTokensRegex { get; }
63124

64125
internal Dictionary<StringSpanOrdinalKey, int>? InternalSpecialTokens { get; }
@@ -91,11 +152,11 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
91152

92153
public string UnknownToken { get; }
93154

94-
public int BeginningOfSentenceId { get; }
155+
public int BeginningOfSentenceId { get; set; }
95156

96-
public int EndOfSentenceId { get; }
157+
public int EndOfSentenceId { get; set; }
97158

98-
public int UnknownId { get; }
159+
public int UnknownId { get; set; }
99160

100161
public SentencePieceNormalizer? Normalizer { get; }
101162

src/Microsoft.ML.Tokenizers/Model/SentencePieceBpeModel.cs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,52 @@ internal SentencePieceBpeModel(ModelProto modelProto, bool addBos, bool addEos,
4141
OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; // 0x7F is the maximum value of the one byte UTF-8 character.
4242
}
4343

44+
internal SentencePieceBpeModel(SentencePieceOptions options) : base(options)
45+
{
46+
if (options.PrecompiledNormalizationData is not null)
47+
{
48+
throw new NotSupportedException("Normalization data is not supported for SentencePieceBpeModel.");
49+
}
50+
51+
Debug.Assert(options.Vocabulary is not null);
52+
53+
int id = 0;
54+
foreach (var item in options.Vocabulary!)
55+
{
56+
_vocab.Add(new StringSpanOrdinalKey(item.Token), (id, item.Score, (byte)ModelProto.Types.SentencePiece.Types.Type.Normal));
57+
_vocabReverse.Add(id++, item.Token);
58+
}
59+
60+
if (options.ByteFallback)
61+
{
62+
if (!_vocab.TryGetValue("<0x00>", out (int Id, float Score, byte Type) value))
63+
{
64+
throw new ArgumentException("'ByteFallback' is enabled but the vocabulary must include a special token for each byte value (0-255) in the format <0xNN>, where NN represents the byte's hexadecimal value.");
65+
}
66+
67+
ByteCodeToIdOffset = value.Id;
68+
OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; // 0x7F is the maximum value of the one byte UTF-8 character.
69+
}
70+
71+
if (!_vocab.TryGetValue(options.UnknownToken, out (int Id, float Score, byte Type) unknownToken))
72+
{
73+
throw new ArgumentException($"The vocabulary must include the unknown token '{options.UnknownToken}'.");
74+
}
75+
UnknownId = unknownToken.Id;
76+
77+
if (!_vocab.TryGetValue(options.BeginningOfSentenceToken, out (int Id, float Score, byte Type) beginOfSentenceToken))
78+
{
79+
throw new ArgumentException($"The vocabulary must include the beginning of sentence token '{options.BeginningOfSentenceToken}'.");
80+
}
81+
BeginningOfSentenceId = beginOfSentenceToken.Id;
82+
83+
if (!_vocab.TryGetValue(options.EndOfSentenceToken, out (int Id, float Score, byte Type) endOfSentenceToken))
84+
{
85+
throw new ArgumentException($"The vocabulary must include the end of sentence token '{options.EndOfSentenceToken}'.");
86+
}
87+
EndOfSentenceId = endOfSentenceToken.Id;
88+
}
89+
4490
public override IReadOnlyDictionary<string, int> Vocabulary
4591
{
4692
get
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Collections.Generic;
6+
7+
namespace Microsoft.ML.Tokenizers
8+
{
9+
#pragma warning disable MSML_NoInstanceInitializers
10+
/// <summary>
11+
/// The type of the SentencePiece model.
12+
/// </summary>
13+
public enum SentencePieceModelType
14+
{
15+
/// <summary>
16+
/// The model type is not defined.
17+
/// </summary>
18+
Undefined = 0,
19+
20+
/// <summary>
21+
/// The model type is Byte Pair Encoding (Bpe) model.
22+
/// </summary>
23+
Bpe = 1,
24+
25+
/// <summary>
26+
/// The model type is Unigram model.
27+
/// </summary>
28+
Unigram = 2,
29+
}
30+
31+
/// <summary>
32+
/// Options for the SentencePiece tokenizer.
33+
/// </summary>
34+
/// <remarks>
35+
/// The options are used to configure the SentencePiece tokenizer. Serialization is not guaranteed for this type.
36+
/// </remarks>
37+
public sealed class SentencePieceOptions
38+
{
39+
/// <summary>
40+
/// The type of the SentencePiece model.
41+
/// </summary>
42+
public SentencePieceModelType ModelType { get; set; }
43+
44+
/// <summary>
45+
/// Determines whether the model uses a byte fallback strategy to encode unknown tokens as byte sequences.
46+
/// </summary>
47+
/// <remarks>
48+
/// The vocabulary must include a special token for each byte value (0-255) in the format &lt;0xNN&gt;,
49+
/// where NN represents the byte's hexadecimal value (e.g., &lt;0x41&gt; for byte value 65).
50+
/// </remarks>
51+
public bool ByteFallback { get; set; }
52+
53+
/// <summary>
54+
/// Indicate emitting the prefix character e.g. U+2581 at the beginning of sentence token during the normalization and encoding.
55+
/// </summary>
56+
public bool AddDummyPrefix { get; set; }
57+
58+
/// <summary>
59+
/// Indicate if the spaces should be replaced with character U+2581 during the normalization and encoding. Default value is `true`.
60+
/// </summary>
61+
public bool EscapeWhiteSpaces { get; set; } = true;
62+
63+
/// <summary>
64+
/// Indicate emitting the character U+2581 at the end of the last sentence token instead beginning of sentence token during the normalization and encoding.
65+
/// </summary>
66+
public bool TreatWhitespaceAsSuffix { get; set; }
67+
68+
/// <summary>
69+
/// Indicate removing extra white spaces from the original string during the normalization.
70+
/// </summary>
71+
public bool RemoveExtraWhiteSpaces { get; set; }
72+
73+
/// <summary>
74+
/// Indicate emitting the beginning of sentence token during the encoding. Default value is `true`.
75+
/// </summary>
76+
public bool AddBeginningOfSentence { get; set; } = true;
77+
78+
/// <summary>
79+
/// Indicate emitting the end of sentence token during the encoding.
80+
/// </summary>
81+
public bool AddEndOfSentence { get; set; }
82+
83+
/// <summary>
84+
/// The beginning of sentence token. Default value is `&lt;s&gt;`.
85+
/// </summary>
86+
public string BeginningOfSentenceToken { get; set; } = "<s>";
87+
88+
/// <summary>
89+
/// The end of sentence token. Default value is `&lt;/s&gt;`.
90+
/// </summary>
91+
public string EndOfSentenceToken { get; set; } = "</s>";
92+
93+
/// <summary>
94+
/// The unknown token. Default value is `&lt;unk&gt;`.
95+
/// </summary>
96+
public string UnknownToken { get; set; } = "<unk>";
97+
98+
/// <summary>
99+
/// The data used for string normalization.
100+
/// </summary>
101+
public byte[]? PrecompiledNormalizationData { get; set; }
102+
103+
/// <summary>
104+
/// Represent the vocabulary.
105+
/// The list should be sorted by token ID, with entries passed in the order that corresponds to their IDs. In other words,
106+
/// the first entry in the list will be mapped to ID 0, the second entry to ID 1, the third to ID 2, and so on.
107+
/// Each entry represents a token and its corresponding score.
108+
/// </summary>
109+
public IEnumerable<(string Token, float Score)>? Vocabulary { get; set; }
110+
111+
/// <summary>
112+
/// The special tokens.
113+
/// Special tokens remain intact during encoding and are not split into sub-tokens.
114+
/// </summary>
115+
public IReadOnlyDictionary<string, int>? SpecialTokens { get; set; }
116+
}
117+
#pragma warning restore MSML_NoInstanceInitializers
118+
}

src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,16 @@ internal SentencePieceTokenizer(ModelProto modelProto, bool addBos, bool addEos,
3030
};
3131
}
3232

33+
internal SentencePieceTokenizer(SentencePieceOptions options)
34+
{
35+
_model = options.ModelType switch
36+
{
37+
SentencePieceModelType.Bpe => new SentencePieceBpeModel(options),
38+
SentencePieceModelType.Unigram => new SentencePieceUnigramModel(options),
39+
_ => throw new ArgumentException($"The model type '{options.ModelType}' is not supported.", nameof(options.ModelType))
40+
};
41+
}
42+
3343
/// <summary>
3444
/// The special tokens.
3545
/// </summary>
@@ -457,5 +467,19 @@ public static SentencePieceTokenizer Create(
457467

458468
return new SentencePieceTokenizer(modelProto, addBeginOfSentence, addEndOfSentence, specialTokens);
459469
}
470+
471+
/// <summary>
472+
/// Creates an instance of SentencePieceTokenizer.
473+
/// </summary>
474+
/// <param name="options">The options to use for the sentence piece tokenizer.</param>
475+
public static SentencePieceTokenizer Create(SentencePieceOptions options)
476+
{
477+
if (options is null)
478+
{
479+
throw new ArgumentNullException(nameof(options));
480+
}
481+
482+
return new SentencePieceTokenizer(options);
483+
}
460484
}
461485
}

0 commit comments

Comments
 (0)
0