1
1
## About
2
2
3
- Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms .
3
+ Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
4
4
5
5
## Key Features
6
6
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
15
15
16
16
``` c#
17
17
using Microsoft .ML .Tokenizers ;
18
- using System .Net .Http ;
19
18
using System .IO ;
19
+ using System .Net .Http ;
20
20
21
21
//
22
22
// Using Tiktoken Tokenizer
23
23
//
24
24
25
- // initialize the tokenizer for `gpt-4 ` model
26
- Tokenizer tokenizer = TiktokenTokenizer .CreateForModel (" gpt-4 " );
25
+ // Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
26
+ Tokenizer tokenizer = TiktokenTokenizer .CreateForModel (" gpt-4o " );
27
27
28
28
string source = " Text tokenization is the process of splitting a string into a list of tokens." ;
29
29
30
30
Console .WriteLine ($" Tokens: {tokenizer .CountTokens (source )}" );
31
- // print : Tokens: 16
31
+ // prints: Tokens: 16
32
32
33
33
var trimIndex = tokenizer .GetIndexByTokenCountFromEnd (source , 5 , out string processedText , out _ );
34
34
Console .WriteLine ($" 5 tokens from end: {processedText .Substring (trimIndex )}" );
35
- // 5 tokens from end: a list of tokens.
35
+ // prints: 5 tokens from end: a list of tokens.
36
36
37
37
trimIndex = tokenizer .GetIndexByTokenCount (source , 5 , out processedText , out _ );
38
38
Console .WriteLine ($" 5 tokens from start: {processedText .Substring (0 , trimIndex )}" );
39
- // 5 tokens from start: Text tokenization is the
39
+ // prints: 5 tokens from start: Text tokenization is the
40
40
41
41
IReadOnlyList < int > ids = tokenizer .EncodeToIds (source );
42
42
Console .WriteLine (string .Join (" , " , ids ));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
46
46
// Using Llama Tokenizer
47
47
//
48
48
49
- // Open stream of remote Llama tokenizer model data file
49
+ // Open a stream to the remote Llama tokenizer model data file.
50
50
using HttpClient httpClient = new ();
51
51
const string modelUrl = @" https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model" ;
52
52
using Stream remoteStream = await httpClient .GetStreamAsync (modelUrl );
53
53
54
- // Create the Llama tokenizer using the remote stream
54
+ // Create the Llama tokenizer using the remote stream. This tokenizer instance should be cached for all subsequent use.
55
55
Tokenizer llamaTokenizer = LlamaTokenizer .Create (remoteStream );
56
+
56
57
string input = " Hello, world!" ;
57
58
ids = llamaTokenizer .EncodeToIds (input );
58
59
Console .WriteLine (string .Join (" , " , ids ));
59
60
// prints: 1, 15043, 29892, 3186, 29991
60
61
61
62
Console .WriteLine ($" Tokens: {llamaTokenizer .CountTokens (input )}" );
62
- // print : Tokens: 5
63
+ // prints: Tokens: 5
63
64
```
64
65
65
66
## Main Types
0 commit comments