Skip to content

Commit

Permalink
feat: Added ability to get encoder/encoding by model name.
Browse files Browse the repository at this point in the history
  • Loading branch information
HavenDV committed May 18, 2024
1 parent 5ee22aa commit 93e1125
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 16 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ We will be happy to accept any PR.
using Tiktoken.Encodings;
using Tiktoken;

var encoding = new O200KBase();
var encoder = new Encoder(encoding);
var encoder = Encoders.ForModel("gpt-4o"); // or explicitly new Encoder(new O200KBase())
var tokens = encoder.Encode("hello world"); // [15339, 1917]
var text = encoder.Decode(tokens); // hello world
var numberOfTokens = encoder.CountTokens(text); // 2
Expand Down
2 changes: 1 addition & 1 deletion src/libs/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
</ItemGroup>

<PropertyGroup Label="Nuget">
<Version>2.0.0</Version>
<Version>2.0.1</Version>
<Description>The fastest tokenizer for GPT-3.5 and GPT-4 inspired by Tiktoken.</Description>
<PackageTags>chatgpt;openai;tiktoken;tokens;gpt-4;gpt-3.5-turbo;cl100k_base;p50k_base</PackageTags>
<GeneratePackageOnBuild Condition=" '$(Configuration)' == 'Release' ">true</GeneratePackageOnBuild>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,66 +2,70 @@

namespace Tiktoken;

// ReSharper disable InconsistentNaming

internal static class EncoderHelpers
/// <summary>
/// Resolves encoders and encodings from model names.
/// </summary>
public static class Encoders
{
/// <summary>
/// Returns encoding by model name.
/// Returns encoder by model name.
/// </summary>
/// <param name="modelName">Model name, e.g. <c>gpt-3.5-turbo</c>.</param>
/// <exception cref="ArgumentException">The model name is not supported.</exception>
/// <returns>A new <see cref="Encoder"/> wrapping the encoding resolved for <paramref name="modelName"/>.</returns>
public static Encoder ForModel(string modelName)
{
return new Encoder(GetNameByModel(modelName));
return new Encoder(GetEncodingByModel(modelName));
}

/// <summary>
/// Returns encoding by model name or null.
/// Returns encoder by model name or null.
/// </summary>
/// <param name="modelName">Model name, e.g. <c>gpt-3.5-turbo</c>.</param>
/// <returns>A new <see cref="Encoder"/>, or <c>null</c> when no encoding is found for <paramref name="modelName"/>.</returns>
public static Encoder? TryForModel(string modelName)
{
var encoding = TryGetNameByModel(modelName);
var encoding = TryGetEncodingByModel(modelName);

// Only construct an encoder when the lookup succeeded; otherwise propagate null to the caller.
return encoding == null
? null
: new Encoder(encoding);
}

private static Dictionary<string, Encodings.Encoding> ModelToEncoding { get; } = new()
private static Dictionary<string, Encoding> ModelToEncoding { get; } = new()
{
// Keys are model-name prefixes: a lookup matches any requested model name that starts with a key.
// Chat models
{ "gpt-4o", new O200KBase() },
{ "gpt-4", new Cl100KBase() },
{ "gpt-3.5-turbo", new Cl100KBase() },
{ "gpt-35-turbo", new Cl100KBase() }, // Azure deployment name

// Embedding models
{ "text-embedding-ada-002", new Cl100KBase() },
{ "text-embedding-3-small", new Cl100KBase() },
{ "text-embedding-3-large", new Cl100KBase() },
};

/// <summary>
/// Returns encoding name by model name or null.
/// Returns encoding by model name or null.
/// </summary>
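/// <param name="modelName">Model name, e.g. <c>gpt-4</c> or <c>gpt-3.5-turbo</c>; known model names are matched as prefixes.</param>
/// <returns>The matching encoding, or <c>null</c> when no known model name is a prefix of <paramref name="modelName"/>.</returns>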
public static Encoding? TryGetNameByModel(string modelName)
public static Encoding? TryGetEncodingByModel(string modelName)
{
    // Known model names are matched as prefixes of the requested name, and some keys are
    // prefixes of each other ("gpt-4" is a prefix of "gpt-4o"). Dictionary enumeration
    // order is undefined, so taking the first match could resolve "gpt-4o" to the "gpt-4"
    // encoding. Selecting the longest matching key makes the result order-independent.
    Encoding? bestEncoding = null;
    var bestKeyLength = -1;

    foreach (var entry in ModelToEncoding)
    {
        if (entry.Key.Length > bestKeyLength &&
            modelName.StartsWith(entry.Key, StringComparison.Ordinal))
        {
            bestEncoding = entry.Value;
            bestKeyLength = entry.Key.Length;
        }
    }

    // Null when no known model name is a prefix of the requested one.
    return bestEncoding;
}

/// <summary>
/// Returns encoding name by model name or throws exception.
/// Returns encoding by model name or throws exception.
/// </summary>
/// <param name="modelName">Model name, e.g. <c>gpt-4</c> or <c>gpt-3.5-turbo</c>.</param>
/// <exception cref="ArgumentException">The model name is not supported.</exception>
/// <returns>The encoding resolved for <paramref name="modelName"/>.</returns>
public static Encoding GetNameByModel(string modelName)
public static Encoding GetEncodingByModel(string modelName)
{
// Delegate to the non-throwing lookup and convert a null result into an exception.
return TryGetNameByModel(modelName) ??
return TryGetEncodingByModel(modelName) ??
throw new ArgumentException($"Model name {modelName} is not supported.");
}
}

0 comments on commit 93e1125

Please sign in to comment.