Final tokenizer's cleanup (#7291) · dotnet/machinelearning@8611211 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8611211

Browse files
authored
Final tokenizer's cleanup (#7291)
1 parent 3659a48 commit 8611211

26 files changed

+1028
-918
lines changed

src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Lines changed: 59 additions & 44 deletions
Large diffs are not rendered by default.
Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
namespace Microsoft.ML.Tokenizers
6+
{
7+
/// <summary>
8+
/// Options for configuring the Bert tokenizer. Extends <see cref="WordPieceOptions"/> with Bert-specific settings.
9+
/// </summary>
10+
public sealed class BertOptions : WordPieceOptions
11+
{
12+
#pragma warning disable MSML_NoInstanceInitializers
13+
/// <summary>
14+
/// Gets or sets a value indicating whether to lowercase the input before tokenization. The default is <see langword="true"/>.
15+
/// </summary>
16+
public bool LowerCaseBeforeTokenization { get; set; } = true;
17+
18+
/// <summary>
19+
/// Gets or sets a value indicating whether to apply basic tokenization. The default is <see langword="true"/>.
20+
/// </summary>
21+
public bool ApplyBasicTokenization { get; set; } = true;
22+
23+
/// <summary>
24+
/// Gets or sets a value indicating whether to split on special tokens. The default is <see langword="true"/>.
25+
/// </summary>
26+
public bool SplitOnSpecialTokens { get; set; } = true;
27+
28+
/// <summary>
29+
/// Gets or sets the separator token to use. The default is "[SEP]".
30+
/// </summary>
31+
public string SeparatorToken { get; set; } = "[SEP]";
32+
33+
/// <summary>
34+
/// Gets or sets the padding token to use. The default is "[PAD]".
35+
/// </summary>
36+
public string PaddingToken { get; set; } = "[PAD]";
37+
38+
/// <summary>
39+
/// Gets or sets the classification token to use. The default is "[CLS]".
40+
/// </summary>
41+
public string ClassificationToken { get; set; } = "[CLS]";
42+
43+
/// <summary>
44+
/// Gets or sets the masking token to use. The default is "[MASK]".
45+
/// </summary>
46+
public string MaskingToken { get; set; } = "[MASK]";
47+
48+
/// <summary>
49+
/// Gets or sets a value indicating whether to tokenize each CJK character as a separate token. The default is <see langword="true"/>.
50+
/// </summary>
51+
/// <remarks>
52+
/// This is useful when you want to tokenize CJK characters individually.
53+
/// The following Unicode ranges are considered CJK characters for this purpose:
54+
/// - U+3400 - U+4DBF CJK Unified Ideographs Extension A.
55+
/// - U+4E00 - U+9FFF basic set of CJK characters.
56+
/// - U+F900 - U+FAFF CJK Compatibility Ideographs.
57+
/// - U+20000 - U+2A6DF CJK Unified Ideographs Extension B.
58+
/// - U+2A700 - U+2B73F CJK Unified Ideographs Extension C.
59+
/// - U+2B740 - U+2B81F CJK Unified Ideographs Extension D.
60+
/// - U+2B820 - U+2CEAF CJK Unified Ideographs Extension E.
61+
/// - U+2F800 - U+2FA1F CJK Compatibility Ideographs Supplement.
62+
/// </remarks>
63+
public bool IndividuallyTokenizeCjk { get; set; } = true;
64+
65+
/// <summary>
66+
/// Gets or sets a value indicating whether to remove non-spacing marks. The default is <see langword="false"/>.
67+
/// </summary>
68+
public bool RemoveNonSpacingMarks { get; set; }
69+
70+
#pragma warning restore MSML_NoInstanceInitializers
71+
}
72+
}

0 commit comments

Comments
 (0)
0