Skip to content

Commit

Permalink
Reduce allocations in EncodingLoader.
Browse files Browse the repository at this point in the history
- Avoid garbage strings from splitting on space.
- Avoid allocations from using LINQ in a hot loop.

Signed-off-by: Bradley Grainger <[email protected]>
  • Loading branch information
bgrainger authored and HavenDV committed Nov 9, 2024
1 parent f7e7404 commit 1679b57
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
14 changes: 13 additions & 1 deletion src/libs/Tiktoken.Core/CoreBPE.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,19 @@ public CoreBpe(
Encoder = encoder;
FastEncoder = Encoder
.ToDictionary(
static x => new string(x.Key.Select(y => (char) y).ToArray()),
#if NETSTANDARD2_1_OR_GREATER || NET6_0_OR_GREATER
// Builds the dictionary key by widening each byte of the token to the char with the
// same numeric value. string.Create writes directly into the new string's storage, so
// no stack buffer sized by the (unbounded) key length is needed.
static x => string.Create(
    x.Key.Length,
    x.Key,
    static (chars, key) =>
    {
        for (var i = 0; i < key.Length; i++)
        {
            chars[i] = (char)key[i];
        }
    }),
#else
static x => new string(x.Key.Select(static y => (char) y).ToArray()),
#endif
static x => x.Value);
SpecialTokensEncoder = specialTokensEncoder;

Expand Down
22 changes: 20 additions & 2 deletions src/libs/Tiktoken.Encodings.Abstractions/EncodingLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public static Dictionary<byte[], int> LoadEncodingFromManifestResource(
assembly.GetManifestResourceStream(resourcePath) ??
throw new InvalidOperationException("Resource not found.");
using var reader = new StreamReader(stream);

var lines = new List<string>();
while (reader.ReadLine() is { } line)
{
Expand All @@ -53,7 +53,11 @@ public static Dictionary<byte[], int> LoadEncodingFromLines(
string name)
{
lines = lines ?? throw new ArgumentNullException(nameof(lines));


#if NET7_0_OR_GREATER
// Scratch buffers declared once, before the per-line loop, and reused for every line.
// Holds the ranges of the space-separated fields of the current line. A valid line has
// exactly two fields; presumably the third slot exists so that a line with extra fields
// produces a count other than 2 and is rejected — TODO confirm.
Span<Range> tokens = stackalloc Range[3];
// Destination for the base64-decoded token bytes of the current line (256-byte capacity).
Span<byte> bytes = stackalloc byte[256];
#endif
var dictionary = new Dictionary<byte[], int>(new ByteArrayComparer());
foreach (var line in lines)
{
Expand All @@ -62,14 +66,28 @@ public static Dictionary<byte[], int> LoadEncodingFromLines(
continue;
}

#if NET7_0_OR_GREATER
// Split on spaces without allocating substrings: the range of each field within `line`
// is written into `tokens`, and the number of ranges written is returned.
// NOTE(review): the span-based Split(Span<Range>, char) overload appears to have been
// introduced in .NET 8 — confirm this guard cannot be reached from a net7.0 target.
var splitCount = line.AsSpan().Split(tokens, ' ');
if (splitCount != 2)
{
throw new FormatException($"Invalid file format: {name}");
}
#else
// Fallback for older targets: allocates one string per field.
var tokens = line.Split(' ');
if (tokens.Length != 2)
{
throw new FormatException($"Invalid file format: {name}");
}
#endif

#if NET7_0_OR_GREATER
// Decode the first field (base64 token) into the reusable scratch buffer and copy out
// only the bytes written. TryFromBase64Chars returns false when the input is malformed
// or the decoded token does not fit in the buffer; ignoring that result would silently
// register an empty key. On failure, fall back to the allocating overload: it decodes
// oversized tokens correctly and throws FormatException for malformed input, matching
// the behavior of the #else branch.
var tokenBytes = Convert.TryFromBase64Chars(line.AsSpan(tokens[0]), bytes, out var bytesWritten)
    ? bytes.Slice(0, bytesWritten).ToArray()
    : Convert.FromBase64String(line[tokens[0]]);
// The second field is the token's rank, parsed culture-invariantly.
var rank = int.Parse(line.AsSpan(tokens[1]), CultureInfo.InvariantCulture);
#else
// Older targets: fields are already separate strings; FromBase64String throws
// FormatException on malformed input.
var tokenBytes = Convert.FromBase64String(tokens[0]);
var rank = int.Parse(tokens[1], CultureInfo.InvariantCulture);
#endif
dictionary[tokenBytes] = rank;
}

Expand Down

0 comments on commit 1679b57

Please sign in to comment.