diff --git a/README.md b/README.md index 86f9298..293b02a 100644 --- a/README.md +++ b/README.md @@ -33,45 +33,51 @@ You can view the reports for each version [here](benchmarks) ``` -BenchmarkDotNet v0.13.12, macOS Sonoma 14.4.1 (23E224) [Darwin 23.4.0] +BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0] Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores -.NET SDK 8.0.204 - [Host] : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD - DefaultJob : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD +.NET SDK 9.0.100 + [Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD + DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD ``` -| Method | Categories | Data | Mean | Median | Ratio | Gen0 | Gen1 | Gen2 | Allocated | Alloc Ratio | -|--------------------------- |------------ |-------------------- |-------------:|-------------:|------:|---------:|--------:|-------:|----------:|------------:| -| **SharpTokenV2_0_1_** | **CountTokens** | **1. (...)57. [19866]** | **632,817.1 ns** | **632,257.2 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** | -| TiktokenSharpV1_0_9_ | CountTokens | 1. (...)57. [19866] | 463,840.3 ns | 458,851.3 ns | 0.74 | 64.4531 | 3.4180 | - | 404649 B | 20.12 | -| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 801,796.0 ns | 806,271.8 ns | 1.27 | 247.0703 | 98.6328 | 0.9766 | 1547675 B | 76.94 | -| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 319,697.2 ns | 319,475.1 ns | 0.50 | 49.3164 | - | - | 309449 B | 15.38 | -| | | | | | | | | | | | -| **SharpTokenV2_0_1_** | **CountTokens** | **Hello, World!** | **478.1 ns** | **478.1 ns** | **1.00** | **0.0401** | **-** | **-** | **256 B** | **1.00** | -| TiktokenSharpV1_0_9_ | CountTokens | Hello, World! | 275.2 ns | 275.1 ns | 0.58 | 0.0505 | - | - | 320 B | 1.25 | -| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 498.1 ns | 497.4 ns | 1.04 | 0.2356 | - | - | 1480 B | 5.78 | -| Tiktoken_ | CountTokens | Hello, World! | 212.9 ns | 212.8 ns | 0.45 | 0.0420 | - | - | 264 B | 1.03 | -| | | | | | | | | | | | -| **SharpTokenV2_0_1_** | **CountTokens** | **King(...)edy. [275]** | **6,652.5 ns** | **6,651.9 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** | -| TiktokenSharpV1_0_9_ | CountTokens | King(...)edy. [275] | 4,774.2 ns | 4,781.1 ns | 0.72 | 0.8011 | - | - | 5064 B | 9.74 | -| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,261.6 ns | 7,241.6 ns | 1.09 | 3.0899 | 0.1450 | 0.0076 | 19344 B | 37.20 | -| Tiktoken_ | CountTokens | King(...)edy. [275] | 3,216.1 ns | 3,189.9 ns | 0.49 | 0.6447 | - | - | 4064 B | 7.82 | -| | | | | | | | | | | | -| **SharpTokenV2_0_1_Encode** | **Encode** | **1. (...)57. [19866]** | **613,700.9 ns** | **612,821.4 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** | -| TiktokenSharpV1_0_9_Encode | Encode | 1. (...)57. [19866] | 444,436.3 ns | 444,298.4 ns | 0.72 | 64.4531 | 3.4180 | - | 404649 B | 20.12 | -| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 773,882.5 ns | 774,314.3 ns | 1.26 | 246.0938 | 85.9375 | - | 1547673 B | 76.94 | -| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 335,482.3 ns | 333,936.4 ns | 0.55 | 59.5703 | 2.4414 | - | 375601 B | 18.67 | -| | | | | | | | | | | | -| **SharpTokenV2_0_1_Encode** | **Encode** | **Hello, World!** | **443.7 ns** | **436.8 ns** | **1.00** | **0.0405** | **-** | **-** | **256 B** | **1.00** | -| TiktokenSharpV1_0_9_Encode | Encode | Hello, World! | 300.4 ns | 299.4 ns | 0.67 | 0.0505 | - | - | 320 B | 1.25 | -| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 504.7 ns | 498.5 ns | 1.15 | 0.2356 | 0.0010 | - | 1480 B | 5.78 | -| Tiktoken_Encode | Encode | Hello, World! | 262.4 ns | 262.6 ns | 0.58 | 0.1030 | - | - | 648 B | 2.53 | -| | | | | | | | | | | | -| **SharpTokenV2_0_1_Encode** | **Encode** | **King(...)edy. [275]** | **6,784.3 ns** | **6,714.1 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** | -| TiktokenSharpV1_0_9_Encode | Encode | King(...)edy. [275] | 4,691.2 ns | 4,690.7 ns | 0.69 | 0.8011 | - | - | 5064 B | 9.74 | -| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,287.9 ns | 7,290.9 ns | 1.08 | 3.0823 | 0.1373 | - | 19344 B | 37.20 | -| Tiktoken_Encode | Encode | King(...)edy. [275] | 3,606.2 ns | 3,607.4 ns | 0.53 | 0.7973 | - | - | 5024 B | 9.66 | +| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio | +|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:| +| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 | +| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 | +| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 | +| | | | | | | | | | +| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 | +| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 | +| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 | +| | | | | | | | | | +| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 | +| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 | +| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 | +| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 | +| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 | +| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 | +| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 | +| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 | +| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 | diff --git a/benchmarks/2.2.0.0_encode.md b/benchmarks/2.2.0.0_encode.md new file mode 100644 index 0000000..02534dd --- /dev/null +++ b/benchmarks/2.2.0.0_encode.md @@ -0,0 +1,47 @@ +``` + +BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0] +Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores +.NET SDK 9.0.100 + [Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD + DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD + + +``` +| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio | +|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:| +| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 | +| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 | +| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 | +| | | | | | | | | | +| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 | +| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 | +| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 | +| | | | | | | | | | +| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** | +| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 | +| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 | +| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 | +| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 | +| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 | +| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 | +| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 | +| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 | +| | | | | | | | | | +| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** | +| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 | +| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 | +| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 | +| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 | diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index 570b442..3d44e7c 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -12,6 +12,8 @@ runtime; build; native; contentfiles; analyzers; buildtransitive + + all diff --git a/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs b/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs index f917f37..60c50a2 100644 --- a/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs +++ b/src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs @@ -1,6 +1,7 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Configs; using Microsoft.DeepDev; +using Microsoft.ML.Tokenizers; using SharpToken; using Tiktoken.Encodings; using TiktokenSharp; @@ -18,6 +19,7 @@ public class Benchmarks private readonly GptEncoding _sharpToken = GptEncoding.GetEncoding("cl100k_base"); private readonly TikToken _tiktokenSharp = TikToken.GetEncoding("cl100k_base"); private readonly Encoder _tiktoken = new(new Cl100KBase()); + private readonly Tokenizer _microsoftMlTiktoken = TiktokenTokenizer.CreateForModel("gpt-4"); private ITokenizer? _tokenizerLib; [Params(Strings.HelloWorld, Strings.KingLear, Strings.Bitcoin)] @@ -31,11 +33,15 @@ public async Task GlobalSetup() [Benchmark(Baseline = true)] [BenchmarkCategory("Encode")] - public List SharpTokenV2_0_1_Encode() => _sharpToken.Encode(Data); + public List SharpTokenV2_0_3_Encode() => _sharpToken.Encode(Data); [Benchmark] [BenchmarkCategory("Encode")] - public List TiktokenSharpV1_0_9_Encode() => _tiktokenSharp.Encode(Data); + public List TiktokenSharpV1_1_5_Encode() => _tiktokenSharp.Encode(Data); + + [Benchmark] + [BenchmarkCategory("Encode")] + public IReadOnlyCollection MicrosoftMLTokenizerV1_0_0_Encode() => _microsoftMlTiktoken.EncodeToIds(Data); [Benchmark] [BenchmarkCategory("Encode")] @@ -48,11 +54,15 @@ public async Task GlobalSetup() [Benchmark(Baseline = true)] [BenchmarkCategory("CountTokens")] - public int SharpTokenV2_0_1_() => _sharpToken.Encode(Data).Count; + public int SharpTokenV2_0_3_() => _sharpToken.Encode(Data).Count; + + [Benchmark] + [BenchmarkCategory("CountTokens")] + public int TiktokenSharpV1_1_5_() => _tiktokenSharp.Encode(Data).Count; [Benchmark] [BenchmarkCategory("CountTokens")] - public int TiktokenSharpV1_0_9_() => _tiktokenSharp.Encode(Data).Count; + public int MicrosoftMLTokenizerV1_0_0_() => _microsoftMlTiktoken.CountTokens(Data); [Benchmark] [BenchmarkCategory("CountTokens")] diff --git a/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj b/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj index a7ff03a..f34875a 100644 --- a/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj +++ b/src/benchmarks/Tiktoken.Benchmarks/Tiktoken.Benchmarks.csproj @@ -9,6 +9,8 @@ + + diff --git a/src/libs/Directory.Build.props b/src/libs/Directory.Build.props index 3fd8e42..4acde72 100644 --- a/src/libs/Directory.Build.props +++ b/src/libs/Directory.Build.props @@ -9,7 +9,7 @@ - 2.1.1 + 2.2.0 The fastest tokenizer for GPT-3.5 and GPT-4 inspired by Tiktoken. chatgpt;openai;tiktoken;tokens;gpt-4;gpt-3.5-turbo;cl100k_base;p50k_base true