Skip to content

Commit

Permalink
benchmark: Added Microsoft.ML.Tokenizers.
Browse files Browse the repository at this point in the history
  • Loading branch information
HavenDV committed Nov 15, 2024
1 parent 4b1c5db commit 6413a0a
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 40 deletions.
76 changes: 41 additions & 35 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,45 +33,51 @@ You can view the reports for each version [here](benchmarks)
<!--BENCHMARKS_START-->
```
BenchmarkDotNet v0.13.12, macOS Sonoma 14.4.1 (23E224) [Darwin 23.4.0]
BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
.NET SDK 8.0.204
[Host] : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
DefaultJob : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
.NET SDK 9.0.100
[Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
```
| Method | Categories | Data | Mean | Median | Ratio | Gen0 | Gen1 | Gen2 | Allocated | Alloc Ratio |
|--------------------------- |------------ |-------------------- |-------------:|-------------:|------:|---------:|--------:|-------:|----------:|------------:|
| **SharpTokenV2_0_1_** | **CountTokens** | **1. (...)57. [19866]** | **632,817.1 ns** | **632,257.2 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_0_9_ | CountTokens | 1. (...)57. [19866] | 463,840.3 ns | 458,851.3 ns | 0.74 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 801,796.0 ns | 806,271.8 ns | 1.27 | 247.0703 | 98.6328 | 0.9766 | 1547675 B | 76.94 |
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 319,697.2 ns | 319,475.1 ns | 0.50 | 49.3164 | - | - | 309449 B | 15.38 |
| | | | | | | | | | | |
| **SharpTokenV2_0_1_** | **CountTokens** | **Hello, World!** | **478.1 ns** | **478.1 ns** | **1.00** | **0.0401** | **-** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_0_9_ | CountTokens | Hello, World! | 275.2 ns | 275.1 ns | 0.58 | 0.0505 | - | - | 320 B | 1.25 |
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 498.1 ns | 497.4 ns | 1.04 | 0.2356 | - | - | 1480 B | 5.78 |
| Tiktoken_ | CountTokens | Hello, World! | 212.9 ns | 212.8 ns | 0.45 | 0.0420 | - | - | 264 B | 1.03 |
| | | | | | | | | | | |
| **SharpTokenV2_0_1_** | **CountTokens** | **King(...)edy. [275]** | **6,652.5 ns** | **6,651.9 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_0_9_ | CountTokens | King(...)edy. [275] | 4,774.2 ns | 4,781.1 ns | 0.72 | 0.8011 | - | - | 5064 B | 9.74 |
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,261.6 ns | 7,241.6 ns | 1.09 | 3.0899 | 0.1450 | 0.0076 | 19344 B | 37.20 |
| Tiktoken_ | CountTokens | King(...)edy. [275] | 3,216.1 ns | 3,189.9 ns | 0.49 | 0.6447 | - | - | 4064 B | 7.82 |
| | | | | | | | | | | |
| **SharpTokenV2_0_1_Encode** | **Encode** | **1. (...)57. [19866]** | **613,700.9 ns** | **612,821.4 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_0_9_Encode | Encode | 1. (...)57. [19866] | 444,436.3 ns | 444,298.4 ns | 0.72 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 773,882.5 ns | 774,314.3 ns | 1.26 | 246.0938 | 85.9375 | - | 1547673 B | 76.94 |
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 335,482.3 ns | 333,936.4 ns | 0.55 | 59.5703 | 2.4414 | - | 375601 B | 18.67 |
| | | | | | | | | | | |
| **SharpTokenV2_0_1_Encode** | **Encode** | **Hello, World!** | **443.7 ns** | **436.8 ns** | **1.00** | **0.0405** | **-** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_0_9_Encode | Encode | Hello, World! | 300.4 ns | 299.4 ns | 0.67 | 0.0505 | - | - | 320 B | 1.25 |
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 504.7 ns | 498.5 ns | 1.15 | 0.2356 | 0.0010 | - | 1480 B | 5.78 |
| Tiktoken_Encode | Encode | Hello, World! | 262.4 ns | 262.6 ns | 0.58 | 0.1030 | - | - | 648 B | 2.53 |
| | | | | | | | | | | |
| **SharpTokenV2_0_1_Encode** | **Encode** | **King(...)edy. [275]** | **6,784.3 ns** | **6,714.1 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_0_9_Encode | Encode | King(...)edy. [275] | 4,691.2 ns | 4,690.7 ns | 0.69 | 0.8011 | - | - | 5064 B | 9.74 |
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,287.9 ns | 7,290.9 ns | 1.08 | 3.0823 | 0.1373 | - | 19344 B | 37.20 |
| Tiktoken_Encode | Encode | King(...)edy. [275] | 3,606.2 ns | 3,607.4 ns | 0.53 | 0.7973 | - | - | 5024 B | 9.66 |
| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
| | | | | | | | | |
| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
| | | | | | | | | |
| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |

<!--BENCHMARKS_END-->

Expand Down
47 changes: 47 additions & 0 deletions benchmarks/2.2.0.0_encode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
```
BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
.NET SDK 9.0.100
[Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
```
| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
| | | | | | | | | |
| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
| | | | | | | | | |
| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
| | | | | | | | | |
| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |
2 changes: 2 additions & 0 deletions src/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageVersion>
<PackageVersion Include="Microsoft.DeepDev.TokenizerLib" Version="1.3.3" />
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="1.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.0" />
<PackageVersion Include="MSTest" Version="3.6.3" />
<PackageVersion Include="PolySharp" Version="1.14.1">
<PrivateAssets>all</PrivateAssets>
Expand Down
18 changes: 14 additions & 4 deletions src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using Microsoft.DeepDev;
using Microsoft.ML.Tokenizers;
using SharpToken;
using Tiktoken.Encodings;
using TiktokenSharp;
Expand All @@ -18,6 +19,7 @@ public class Benchmarks
private readonly GptEncoding _sharpToken = GptEncoding.GetEncoding("cl100k_base");
private readonly TikToken _tiktokenSharp = TikToken.GetEncoding("cl100k_base");
private readonly Encoder _tiktoken = new(new Cl100KBase());
private readonly Tokenizer _microsoftMlTiktoken = TiktokenTokenizer.CreateForModel("gpt-4");
private ITokenizer? _tokenizerLib;

[Params(Strings.HelloWorld, Strings.KingLear, Strings.Bitcoin)]
Expand All @@ -31,11 +33,15 @@ public async Task GlobalSetup()

[Benchmark(Baseline = true)]
[BenchmarkCategory("Encode")]
public List<int> SharpTokenV2_0_1_Encode() => _sharpToken.Encode(Data);
public List<int> SharpTokenV2_0_3_Encode() => _sharpToken.Encode(Data);

[Benchmark]
[BenchmarkCategory("Encode")]
public List<int> TiktokenSharpV1_0_9_Encode() => _tiktokenSharp.Encode(Data);
public List<int> TiktokenSharpV1_1_5_Encode() => _tiktokenSharp.Encode(Data);

[Benchmark]
[BenchmarkCategory("Encode")]
public IReadOnlyCollection<int> MicrosoftMLTokenizerV1_0_0_Encode() => _microsoftMlTiktoken.EncodeToIds(Data);

[Benchmark]
[BenchmarkCategory("Encode")]
Expand All @@ -48,11 +54,15 @@ public async Task GlobalSetup()

[Benchmark(Baseline = true)]
[BenchmarkCategory("CountTokens")]
public int SharpTokenV2_0_1_() => _sharpToken.Encode(Data).Count;
public int SharpTokenV2_0_3_() => _sharpToken.Encode(Data).Count;

[Benchmark]
[BenchmarkCategory("CountTokens")]
public int TiktokenSharpV1_1_5_() => _tiktokenSharp.Encode(Data).Count;

[Benchmark]
[BenchmarkCategory("CountTokens")]
public int TiktokenSharpV1_0_9_() => _tiktokenSharp.Encode(Data).Count;
public int MicrosoftMLTokenizerV1_0_0_() => _microsoftMlTiktoken.CountTokens(Data);

[Benchmark]
[BenchmarkCategory("CountTokens")]
Expand Down
Loading

0 comments on commit 6413a0a

Please sign in to comment.