// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Configuration settings used when creating a Bert tokenizer.
    /// </summary>
    /// <remarks>
    /// Extends <see cref="WordPieceOptions"/> with Bert-specific switches and special-token strings.
    /// </remarks>
    public sealed class BertOptions : WordPieceOptions
    {
        // The defaults below are declared as property initializers, so the
        // MSML_NoInstanceInitializers diagnostic is suppressed for this region.
#pragma warning disable MSML_NoInstanceInitializers

        /// <summary>
        /// Gets or sets a value indicating whether the input is lowercased before it is tokenized.
        /// </summary>
        /// <value>Defaults to <see langword="true"/>.</value>
        public bool LowerCaseBeforeTokenization { get; set; } = true;

        /// <summary>
        /// Gets or sets a value indicating whether basic tokenization is applied.
        /// </summary>
        /// <value>Defaults to <see langword="true"/>.</value>
        public bool ApplyBasicTokenization { get; set; } = true;

        /// <summary>
        /// Gets or sets a value indicating whether the input is split on special tokens.
        /// </summary>
        /// <value>Defaults to <see langword="true"/>.</value>
        public bool SplitOnSpecialTokens { get; set; } = true;

        /// <summary>
        /// Gets or sets the token used as the separator.
        /// </summary>
        /// <value>Defaults to <c>[SEP]</c>.</value>
        public string SeparatorToken { get; set; } = "[SEP]";

        /// <summary>
        /// Gets or sets the token used for padding.
        /// </summary>
        /// <value>Defaults to <c>[PAD]</c>.</value>
        public string PaddingToken { get; set; } = "[PAD]";

        /// <summary>
        /// Gets or sets the token used for classification.
        /// </summary>
        /// <value>Defaults to <c>[CLS]</c>.</value>
        public string ClassificationToken { get; set; } = "[CLS]";

        /// <summary>
        /// Gets or sets the token used for masking.
        /// </summary>
        /// <value>Defaults to <c>[MASK]</c>.</value>
        public string MaskingToken { get; set; } = "[MASK]";

        /// <summary>
        /// Gets or sets a value indicating whether CJK characters are emitted as separate tokens.
        /// </summary>
        /// <value>Defaults to <see langword="true"/>.</value>
        /// <remarks>
        /// Enable this when each CJK character should be tokenized on its own.
        /// For this purpose, the following Unicode ranges are treated as CJK characters:
        /// <list type="bullet">
        ///   <item><description>U+3400 - U+4DBF: CJK Unified Ideographs Extension A.</description></item>
        ///   <item><description>U+4E00 - U+9FFF: basic set of CJK characters.</description></item>
        ///   <item><description>U+F900 - U+FAFF: CJK Compatibility Ideographs.</description></item>
        ///   <item><description>U+20000 - U+2A6DF: CJK Unified Ideographs Extension B.</description></item>
        ///   <item><description>U+2A700 - U+2B73F: CJK Unified Ideographs Extension C.</description></item>
        ///   <item><description>U+2B740 - U+2B81F: CJK Unified Ideographs Extension D.</description></item>
        ///   <item><description>U+2B820 - U+2CEAF: CJK Unified Ideographs Extension E.</description></item>
        ///   <item><description>U+2F800 - U+2FA1F: CJK Compatibility Ideographs Supplement.</description></item>
        /// </list>
        /// </remarks>
        public bool IndividuallyTokenizeCjk { get; set; } = true;

        /// <summary>
        /// Gets or sets a value indicating whether non-spacing marks are removed.
        /// </summary>
        /// <value>No initializer is supplied, so the value is <see langword="false"/> unless set.</value>
        public bool RemoveNonSpacingMarks { get; set; }

#pragma warning restore MSML_NoInstanceInitializers
    }
}
0 commit comments