Unigram tokenizer fixes (#7409) · dotnet/machinelearning@12ce84a · GitHub
[go: up one dir, main page]

Skip to content

Commit 12ce84a

Browse files
authored
Unigram tokenizer fixes (#7409)
* Unigram tokenizer fixes
* Add a comment
* Feedback
1 parent 142d7f5 commit 12ce84a

File tree

4 files changed

+46
-7
lines changed

4 files changed

+46
-7
lines changed

src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
2525
AddBeginningOfSentence = addBos;
2626
AddEndOfSentence = addEos;
2727
BeginningOfSentenceToken = modelProto.TrainerSpec.BosPiece ?? "<s>";
28-
BeginningOfSentenceId = modelProto.TrainerSpec.BosId <= 0 ? 1 : modelProto.TrainerSpec.BosId;
28+
BeginningOfSentenceId = Math.Max(0, modelProto.TrainerSpec.BosId);
2929
EndOfSentenceToken = modelProto.TrainerSpec.EosPiece ?? "</s>";
30-
EndOfSentenceId = modelProto.TrainerSpec.EosId <= 0 ? 1 : modelProto.TrainerSpec.EosId;
30+
EndOfSentenceId = Math.Max(0, modelProto.TrainerSpec.EosId);
3131
UnknownToken = modelProto.TrainerSpec.UnkPiece ?? "<unk>";
32-
UnknownId = modelProto.TrainerSpec.UnkId < 0 ? 0 : modelProto.TrainerSpec.UnkId;
32+
UnknownId = Math.Max(0, modelProto.TrainerSpec.UnkId);
3333
AddDummyPrefix = modelProto.NormalizerSpec.AddDummyPrefix;
3434
EscapeWhiteSpaces = modelProto.NormalizerSpec.EscapeWhitespaces;
3535
TreatWhitespaceAsSuffix = modelProto.TrainerSpec.TreatWhitespaceAsSuffix;

src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,27 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
6262

6363
_trie = new DoubleArrayTrie(_vocab);
6464

65-
_vocabReverse[BeginningOfSentenceId] = (BeginningOfSentenceToken, 0f, 0);
66-
_vocabReverse[EndOfSentenceId] = (EndOfSentenceToken, 0f, 0);
65+
// Once the trie is built, we need to add the special tokens to the vocabulary.
66+
// Including these special tokens ensures they are mapped like regular tokens.
67+
// SentencePiece specifically handles the BOS, EOS, and UNK tokens, while the PAD token is optional.
68+
69+
Debug.Assert(modelProto.TrainerSpec.UnkId >= 0);
70+
Debug.Assert(modelProto.TrainerSpec.BosId >= 0);
71+
Debug.Assert(modelProto.TrainerSpec.EosId >= 0);
72+
73+
_vocab[modelProto.TrainerSpec.UnkPiece] = modelProto.TrainerSpec.UnkId;
74+
_vocab[modelProto.TrainerSpec.BosPiece] = modelProto.TrainerSpec.BosId;
75+
_vocab[modelProto.TrainerSpec.EosPiece] = modelProto.TrainerSpec.EosId;
76+
77+
_vocabReverse[modelProto.TrainerSpec.BosId] = (modelProto.TrainerSpec.BosPiece, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
78+
_vocabReverse[modelProto.TrainerSpec.EosId] = (modelProto.TrainerSpec.EosPiece, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
79+
_vocabReverse[modelProto.TrainerSpec.UnkId] = (modelProto.TrainerSpec.UnkPiece, 0f, ModelProto.Types.SentencePiece.Types.Type.Unknown);
80+
81+
if (modelProto.TrainerSpec.PadId >= 0)
82+
{
83+
_vocab[modelProto.TrainerSpec.PadPiece] = modelProto.TrainerSpec.PadId;
84+
_vocabReverse[modelProto.TrainerSpec.PadId] = (modelProto.TrainerSpec.PadPiece, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
85+
}
6786
}
6887

6988
public SentencePieceUnigramModel(SentencePieceOptions options) : base(options)

src/Microsoft.ML.Tokenizers/Normalizer/SentencePieceNormalizer.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ internal int Normalize(ReadOnlySpan<byte> input, ref Span<byte> normalized, ref
355355
break;
356356
}
357357

358-
ReadOnlySpan<byte> normalizedByte = normalizedPrefix.Equals(default(Memory<byte>)) ? input.Slice(0, p) : normalizedPrefix.Span;
358+
ReadOnlySpan<byte> normalizedByte = normalizedPrefix.Length == 0 ? input.Slice(0, p) : normalizedPrefix.Span;
359359
if (normalizedByte[0] != (byte)' ')
360360
{
361361
break;
@@ -386,7 +386,7 @@ internal int Normalize(ReadOnlySpan<byte> input, ref Span<byte> normalized, ref
386386
while (!input.IsEmpty)
387387
{
388388
int p = NormalizePrefix(input, out Memory<byte> normalizedPrefix);
389-
ReadOnlySpan<byte> sp = normalizedPrefix.Equals(default(Memory<byte>)) ? input.Slice(0, p) : normalizedPrefix.Span;
389+
ReadOnlySpan<byte> sp = normalizedPrefix.Length == 0 ? input.Slice(0, p) : normalizedPrefix.Span;
390390

391391
// Removes heading spaces in sentence piece, if the previous sentence piece ends with whitespace.
392392
while (isPrevSpace && sp.Length > 0 && sp[0] == (byte)' ')

test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,26 @@ public static IEnumerable<object[]> UnigramTestData()
147147
new Range[0]
148148
};
149149

150+
yield return new object[]
151+
{
152+
"\u001f", // string start with control character
153+
"▁\u001f",
154+
"",
155+
new int[] { 5, 0 },
156+
new string[] { "▁", "\u001f"},
157+
new Range[] { new Range(0, 1), new Range(1, 2) }
158+
};
159+
160+
yield return new object[]
161+
{
162+
"\ufe7b", // Decompose to two letters
163+
"▁\u0640\u0650",
164+
"\u0640\u0650",
165+
new int[] { 17637, 487 },
166+
new string[] { "▁\u0640", "\u0650" },
167+
new Range[] { new Range(0, 2), new Range(2, 3) }
168+
};
169+
150170
yield return new object[]
151171
{
152172
@"The sun dipped below the horizon, casting a warm golden hue across the tranquil meadow. Birds fluttered from " +

0 commit comments

Comments
 (0)
0