Some tweaks to the Microsoft.ML.Tokenizers PACKAGE.md (#7360) · dotnet/machinelearning@86bdd73 · GitHub

Commit 86bdd73

Some tweaks to the Microsoft.ML.Tokenizers PACKAGE.md (#7360)
1 parent 886e2ff commit 86bdd73

1 file changed: +11 -10 lines changed


src/Microsoft.ML.Tokenizers/PACKAGE.md

@@ -1,6 +1,6 @@
 ## About
 
-Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms.
+Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
 
 ## Key Features
 
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
 
 ```c#
 using Microsoft.ML.Tokenizers;
-using System.Net.Http;
 using System.IO;
+using System.Net.Http;
 
 //
 // Using Tiktoken Tokenizer
 //
 
-// initialize the tokenizer for `gpt-4` model
-Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
+// Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
 
 string source = "Text tokenization is the process of splitting a string into a list of tokens.";
 
 Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
-// print: Tokens: 16
+// prints: Tokens: 16
 
 var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string processedText, out _);
 Console.WriteLine($"5 tokens from end: {processedText.Substring(trimIndex)}");
-// 5 tokens from end: a list of tokens.
+// prints: 5 tokens from end: a list of tokens.
 
 trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
 Console.WriteLine($"5 tokens from start: {processedText.Substring(0, trimIndex)}");
-// 5 tokens from start: Text tokenization is the
+// prints: 5 tokens from start: Text tokenization is the
 
 IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
 Console.WriteLine(string.Join(", ", ids));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
 // Using Llama Tokenizer
 //
 
-// Open stream of remote Llama tokenizer model data file
+// Open a stream to the remote Llama tokenizer model data file.
 using HttpClient httpClient = new();
 const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
 using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);
 
-// Create the Llama tokenizer using the remote stream
+// Create the Llama tokenizer using the remote stream. This should be cached for all subsequent use.
 Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);
+
 string input = "Hello, world!";
 ids = llamaTokenizer.EncodeToIds(input);
 Console.WriteLine(string.Join(", ", ids));
 // prints: 1, 15043, 29892, 3186, 29991
 
 Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
-// print: Tokens: 5
+// prints: Tokens: 5
 ```
 
 ## Main Types

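The updated comments in the diff call out that a tokenizer instance should be cached and reused rather than recreated per call. Below is a minimal sketch of one way a consumer might follow that guidance; the `CachedTokenizers` holder class and `Lazy<T>` pattern are illustrative assumptions, not part of the package, while `Tokenizer`, `TiktokenTokenizer.CreateForModel`, and `CountTokens` are the APIs shown in the diff above.

```c#
using System;
using Microsoft.ML.Tokenizers;

// Hypothetical holder that creates the tokenizer once and shares the instance.
internal static class CachedTokenizers
{
    // Lazy initialization: the tokenizer is built on first access and then reused.
    private static readonly Lazy<Tokenizer> s_gpt4o =
        new(() => TiktokenTokenizer.CreateForModel("gpt-4o"));

    public static Tokenizer Gpt4o => s_gpt4o.Value;
}

internal static class Program
{
    private static void Main()
    {
        string first = "Hello, world!";
        string second = "Text tokenization splits a string into a list of tokens.";

        // Both calls hit the same cached tokenizer instance.
        Console.WriteLine($"Tokens: {CachedTokenizers.Gpt4o.CountTokens(first)}");
        Console.WriteLine($"Tokens: {CachedTokenizers.Gpt4o.CountTokens(second)}");
    }
}
```

A plain static readonly field or a dependency-injection singleton would satisfy the same caching recommendation; `Lazy<T>` is used here only to defer the one-time setup until the tokenizer is first needed.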