diff --git a/README.md b/README.md
index 86f9298..293b02a 100644
--- a/README.md
+++ b/README.md
@@ -33,45 +33,51 @@ You can view the reports for each version [here](benchmarks)
```
-BenchmarkDotNet v0.13.12, macOS Sonoma 14.4.1 (23E224) [Darwin 23.4.0]
+BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
-.NET SDK 8.0.204
- [Host] : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
- DefaultJob : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
+.NET SDK 9.0.100
+ [Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
+ DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
```
-| Method | Categories | Data | Mean | Median | Ratio | Gen0 | Gen1 | Gen2 | Allocated | Alloc Ratio |
-|--------------------------- |------------ |-------------------- |-------------:|-------------:|------:|---------:|--------:|-------:|----------:|------------:|
-| **SharpTokenV2_0_1_** | **CountTokens** | **1. (...)57. [19866]** | **632,817.1 ns** | **632,257.2 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
-| TiktokenSharpV1_0_9_ | CountTokens | 1. (...)57. [19866] | 463,840.3 ns | 458,851.3 ns | 0.74 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
-| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 801,796.0 ns | 806,271.8 ns | 1.27 | 247.0703 | 98.6328 | 0.9766 | 1547675 B | 76.94 |
-| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 319,697.2 ns | 319,475.1 ns | 0.50 | 49.3164 | - | - | 309449 B | 15.38 |
-| | | | | | | | | | | |
-| **SharpTokenV2_0_1_** | **CountTokens** | **Hello, World!** | **478.1 ns** | **478.1 ns** | **1.00** | **0.0401** | **-** | **-** | **256 B** | **1.00** |
-| TiktokenSharpV1_0_9_ | CountTokens | Hello, World! | 275.2 ns | 275.1 ns | 0.58 | 0.0505 | - | - | 320 B | 1.25 |
-| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 498.1 ns | 497.4 ns | 1.04 | 0.2356 | - | - | 1480 B | 5.78 |
-| Tiktoken_ | CountTokens | Hello, World! | 212.9 ns | 212.8 ns | 0.45 | 0.0420 | - | - | 264 B | 1.03 |
-| | | | | | | | | | | |
-| **SharpTokenV2_0_1_** | **CountTokens** | **King(...)edy. [275]** | **6,652.5 ns** | **6,651.9 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
-| TiktokenSharpV1_0_9_ | CountTokens | King(...)edy. [275] | 4,774.2 ns | 4,781.1 ns | 0.72 | 0.8011 | - | - | 5064 B | 9.74 |
-| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,261.6 ns | 7,241.6 ns | 1.09 | 3.0899 | 0.1450 | 0.0076 | 19344 B | 37.20 |
-| Tiktoken_ | CountTokens | King(...)edy. [275] | 3,216.1 ns | 3,189.9 ns | 0.49 | 0.6447 | - | - | 4064 B | 7.82 |
-| | | | | | | | | | | |
-| **SharpTokenV2_0_1_Encode** | **Encode** | **1. (...)57. [19866]** | **613,700.9 ns** | **612,821.4 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
-| TiktokenSharpV1_0_9_Encode | Encode | 1. (...)57. [19866] | 444,436.3 ns | 444,298.4 ns | 0.72 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
-| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 773,882.5 ns | 774,314.3 ns | 1.26 | 246.0938 | 85.9375 | - | 1547673 B | 76.94 |
-| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 335,482.3 ns | 333,936.4 ns | 0.55 | 59.5703 | 2.4414 | - | 375601 B | 18.67 |
-| | | | | | | | | | | |
-| **SharpTokenV2_0_1_Encode** | **Encode** | **Hello, World!** | **443.7 ns** | **436.8 ns** | **1.00** | **0.0405** | **-** | **-** | **256 B** | **1.00** |
-| TiktokenSharpV1_0_9_Encode | Encode | Hello, World! | 300.4 ns | 299.4 ns | 0.67 | 0.0505 | - | - | 320 B | 1.25 |
-| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 504.7 ns | 498.5 ns | 1.15 | 0.2356 | 0.0010 | - | 1480 B | 5.78 |
-| Tiktoken_Encode | Encode | Hello, World! | 262.4 ns | 262.6 ns | 0.58 | 0.1030 | - | - | 648 B | 2.53 |
-| | | | | | | | | | | |
-| **SharpTokenV2_0_1_Encode** | **Encode** | **King(...)edy. [275]** | **6,784.3 ns** | **6,714.1 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
-| TiktokenSharpV1_0_9_Encode | Encode | King(...)edy. [275] | 4,691.2 ns | 4,690.7 ns | 0.69 | 0.8011 | - | - | 5064 B | 9.74 |
-| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,287.9 ns | 7,290.9 ns | 1.08 | 3.0823 | 0.1373 | - | 19344 B | 37.20 |
-| Tiktoken_Encode | Encode | King(...)edy. [275] | 3,606.2 ns | 3,607.4 ns | 0.53 | 0.7973 | - | - | 5024 B | 9.66 |
+| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
+|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
+| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
+| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
+| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
+| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
+| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
+| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
+| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
+| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
+| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
+| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
+| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
+| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
+| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |
diff --git a/benchmarks/2.2.0.0_encode.md b/benchmarks/2.2.0.0_encode.md
new file mode 100644
index 0000000..02534dd
--- /dev/null
+++ b/benchmarks/2.2.0.0_encode.md
@@ -0,0 +1,47 @@
+```
+
+BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
+Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
+.NET SDK 9.0.100
+ [Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
+ DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
+
+
+```
+| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
+|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
+| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
+| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
+| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
+| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
+| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
+| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
+| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
+| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
+| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
+| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
+| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
+| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
+| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
+| | | | | | | | | |
+| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
+| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
+| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
+| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
+| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |
diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index 570b442..3d44e7c 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -12,6 +12,8 @@
runtime; build; native; contentfiles; analyzers; buildtransitive
+
+
all
diff --git a/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs b/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs
index f917f37..60c50a2 100644
--- a/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs
+++ b/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs
@@ -1,6 +1,7 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using Microsoft.DeepDev;
+using Microsoft.ML.Tokenizers;
using SharpToken;
using Tiktoken.Encodings;
using TiktokenSharp;
@@ -18,6 +19,7 @@ public class Benchmarks
private readonly GptEncoding _sharpToken = GptEncoding.GetEncoding("cl100k_base");
private readonly TikToken _tiktokenSharp = TikToken.GetEncoding("cl100k_base");
private readonly Encoder _tiktoken = new(new Cl100KBase());
+ private readonly Tokenizer _microsoftMlTiktoken = TiktokenTokenizer.CreateForModel("gpt-4");
private ITokenizer? _tokenizerLib;
[Params(Strings.HelloWorld, Strings.KingLear, Strings.Bitcoin)]
@@ -31,11 +33,15 @@ public async Task GlobalSetup()
[Benchmark(Baseline = true)]
[BenchmarkCategory("Encode")]
- public List<int> SharpTokenV2_0_1_Encode() => _sharpToken.Encode(Data);
+ public List<int> SharpTokenV2_0_3_Encode() => _sharpToken.Encode(Data);
[Benchmark]
[BenchmarkCategory("Encode")]
- public List<int> TiktokenSharpV1_0_9_Encode() => _tiktokenSharp.Encode(Data);
+ public List<int> TiktokenSharpV1_1_5_Encode() => _tiktokenSharp.Encode(Data);
+
+ [Benchmark]
+ [BenchmarkCategory("Encode")]
+ public IReadOnlyCollection<int> MicrosoftMLTokenizerV1_0_0_Encode() => _microsoftMlTiktoken.EncodeToIds(Data);
[Benchmark]
[BenchmarkCategory("Encode")]
@@ -48,11 +54,15 @@ public async Task GlobalSetup()
[Benchmark(Baseline = true)]
[BenchmarkCategory("CountTokens")]
- public int SharpTokenV2_0_1_() => _sharpToken.Encode(Data).Count;
+ public int SharpTokenV2_0_3_() => _sharpToken.Encode(Data).Count;
+
+ [Benchmark]
+ [BenchmarkCategory("CountTokens")]
+ public int TiktokenSharpV1_1_5_() => _tiktokenSharp.Encode(Data).Count;
[Benchmark]
[BenchmarkCategory("CountTokens")]
- public int TiktokenSharpV1_0_9_() => _tiktokenSharp.Encode(Data).Count;
+ public int MicrosoftMLTokenizerV1_0_0_() => _microsoftMlTiktoken.CountTokens(Data);
[Benchmark]
[BenchmarkCategory("CountTokens")]
diff --git a/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj b/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj
index a7ff03a..f34875a 100644
--- a/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj
+++ b/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj
@@ -9,6 +9,8 @@
+
+
diff --git a/src/libs/Directory.Build.props b/src/libs/Directory.Build.props
index 3fd8e42..4acde72 100644
--- a/src/libs/Directory.Build.props
+++ b/src/libs/Directory.Build.props
@@ -9,7 +9,7 @@
- 2.1.1
+ 2.2.0
The fastest tokenizer for GPT-3.5 and GPT-4 inspired by Tiktoken.
chatgpt;openai;tiktoken;tokens;gpt-4;gpt-3.5-turbo;cl100k_base;p50k_base
true