Commit

Update docs (#50)
jonatanklosko authored Aug 8, 2023
1 parent 15b9cdf commit 777cf46
Showing 21 changed files with 688 additions and 615 deletions.
27 changes: 11 additions & 16 deletions README.md
@@ -4,15 +4,9 @@

Elixir bindings for [Hugging Face Tokenizers](https://github.com/huggingface/tokenizers).

## Getting started
## Installation

In order to use `Tokenizers`, you will need Elixir installed. Then create an Elixir project via the `mix` build tool:

```
$ mix new my_app
```

Then you can add `Tokenizers` as dependency in your `mix.exs`.
You can add `:tokenizers` as a dependency in your `mix.exs`:

```elixir
def deps do
@@ -30,26 +24,27 @@ Mix.install([
])
```
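For reference, a complete `deps/0` entry could look like the sketch below; the version requirement here is only a placeholder, so check the package page on Hex for the current release.

```elixir
def deps do
  [
    # Placeholder version requirement -- use the latest :tokenizers release from Hex
    {:tokenizers, "~> 0.4"}
  ]
end
```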

## Quick example
## Example

You can use any pre-trained tokenizer from any model repo on Hugging Face Hub, such as [bert-base-cased](https://huggingface.co/bert-base-cased).

```elixir
# Go get a tokenizer -- any from the Hugging Face models repo will do
{:ok, tokenizer} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased")
{:ok, encoding} = Tokenizers.Tokenizer.encode(tokenizer, "Hello there!")
Tokenizers.Encoding.get_tokens(encoding)
# ["Hello", "there", "!"]
#=> ["Hello", "there", "!"]
Tokenizers.Encoding.get_ids(encoding)
# [8667, 1175, 106]
#=> [8667, 1175, 106]
```
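If you need to map ids back to text as well, here is a brief sketch assuming `Tokenizers.Tokenizer.decode/2`, the counterpart of `encode/2`; the exact output depends on the tokenizer's decoder settings.

```elixir
# Round-trip the ids from the example above back into text
{:ok, text} = Tokenizers.Tokenizer.decode(tokenizer, [8667, 1175, 106])
text
#=> "Hello there!"
```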

The [notebooks](./notebooks) directory has [an introductory Livebook](./notebooks/pretrained.livemd) to give you a feel for the API.

## Contributing

Tokenizers uses Rust to call functionality from the Hugging Face Tokenizers library. While
Rust is not necessary to use Tokenizers as a package, you need Rust tooling installed on
your machine if you want to compile from source, which is the case when contributing to
Tokenizers. In particular, you will need Rust Stable, which can be installed with
[Rustup](https://rust-lang.github.io/rustup/installation/index.html).
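If you do not have Rust installed yet, the standard rustup installer from the page linked above is the usual route:

```
$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
```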

## License
19 changes: 11 additions & 8 deletions lib/tokenizers.ex
@@ -4,16 +4,19 @@ defmodule Tokenizers do
Hugging Face describes the Tokenizers library as:
> Fast State-of-the-art tokenizers, optimized for both research and
> production
>
> 🤗 Tokenizers provides an implementation of today’s most used
> tokenizers, with a focus on performance and versatility. These
> tokenizers are also used in 🤗 Transformers.
This library has bindings to use pretrained tokenizers. Support for building and training
a tokenizer from scratch is forthcoming.
A tokenizer is effectively a pipeline of transformations that take
a text input and return an encoded version of that text (`t:Tokenizers.Encoding.t/0`).
A tokenizer is effectively a pipeline of transforms to take some input text and return a
`Tokenizers.Encoding.t()`. The main entrypoint to this library is the `Tokenizers.Tokenizer`
module, which holds the `Tokenizers.Tokenizer.t()` struct, a container holding the constituent
parts of the pipeline. Most functionality is there.
The main entrypoint to this library is the `Tokenizers.Tokenizer`
module, which defines the `t:Tokenizers.Tokenizer.t/0` struct, a
container holding the constituent parts of the pipeline. Most
functionality is in that module.
"""
end
70 changes: 34 additions & 36 deletions lib/tokenizers/added_token.ex
@@ -1,53 +1,51 @@
defmodule Tokenizers.AddedToken do
@moduledoc """
This struct represents AddedTokens
This struct represents a token added to the tokenizer vocabulary.
"""

@type t() :: %__MODULE__{resource: reference()}
defstruct [:resource]

@typedoc """
Options for added token initialisation. All options can be omitted.
"""
@type opts() :: [
special: boolean(),
single_word: boolean(),
lstrip: boolean(),
rstrip: boolean(),
normalized: boolean()
]

@doc """
Create a new AddedToken.
Builds a new added token.
## Options
* `:special` - defines whether this token is a special token.
Defaults to `false`
* `:special` (default `false`) - defines whether this token is a special token.
* `:single_word` - defines whether this token should only match
single words. If `true`, this token will never match inside of a
word. For example the token `ing` would match on `tokenizing` if
this option is `false`. The notion of ”inside of a word” is
defined by the word boundaries pattern in regular expressions
(i.e. the token should start and end with word boundaries).
Defaults to `false`
* `:single_word` (default `false`) - defines whether this token should only match single words.
If `true`, this token will never match inside of a word. For example the token `ing` would
match on `tokenizing` if this option is `false`, but not if it is `true`.
The notion of ”inside of a word” is defined by the word boundaries pattern
in regular expressions (i.e. the token should start and end with word boundaries).
* `:lstrip` - defines whether this token should strip all potential
whitespace on its left side. If `true`, this token will greedily
match any whitespace on its left. For example if we try to match
the token `[MASK]` with `lstrip=true`, in the text `"I saw a [MASK]"`,
we would match on `" [MASK]"`. (Note the space on the left).
Defaults to `false`
* `:lstrip` (default `false`) - defines whether this token should strip all potential
whitespaces on its left side.
If `true`, this token will greedily match any whitespace on its left.
For example if we try to match the token `[MASK]` with `lstrip=true`,
in the text `"I saw a [MASK]"`, we would match on `" [MASK]"`. (Note the space on the left).
* `:rstrip` - defines whether this token should strip all potential
whitespaces on its right side. If `true`, this token will greedily
match any whitespace on its right. It works just like `:lstrip`,
but on the right. Defaults to `false`
* `:rstrip` (default `false`) - defines whether this token should strip all potential
whitespaces on its right side.
If `true`, this token will greedily match any whitespace on its right.
It works just like `lstrip` but on the right.
* `:normalized` - defines whether this token should match against
the normalized version of the input text. For example, with the
added token `"yesterday"`, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input
`"I saw a lion Yesterday"`. If `true`, the token will be extracted
from the normalized input `"i saw a lion yesterday"`. If `false`,
the token will be extracted from the original input
`"I saw a lion Yesterday"`. Defaults to `false` for special tokens
and `true` otherwise
* `:normalized` (default `true` for non-special tokens, `false` for special tokens) -
defines whether this token should match against the normalized version of the input text.
For example, with the added token `"yesterday"`,
and a normalizer in charge of lowercasing the text,
the token could be extracted from the input `"I saw a lion Yesterday"`.
If `true`, the token will be extracted from the normalized input `"i saw a lion yesterday"`.
If `false`, the token will be extracted from the original input `"I saw a lion Yesterday"`.
"""
@spec new(token :: String.t(), opts :: opts()) :: t()
@spec new(token :: String.t(), keyword()) :: t()
defdelegate new(token, opts \\ []), to: Tokenizers.Native, as: :added_token_new

@doc """
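To give a feel for how these options combine, here is a small sketch using `new/2`; passing the resulting structs to a tokenizer (for example via the add-token functions in `Tokenizers.Tokenizer`) is assumed to happen elsewhere.

```elixir
# A special token that also swallows any whitespace to its left
mask = Tokenizers.AddedToken.new("[MASK]", special: true, lstrip: true)

# A regular added token that only matches whole words and, by default,
# is matched against the normalized (e.g. lowercased) input
yesterday = Tokenizers.AddedToken.new("yesterday", single_word: true)
```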
117 changes: 57 additions & 60 deletions lib/tokenizers/decoder.ex
@@ -1,10 +1,16 @@
defmodule Tokenizers.Decoder do
@moduledoc """
The Decoder knows how to go from the IDs used by the Tokenizer, back to a readable piece of text.
Some Normalizer and PreTokenizer use special characters or identifiers that need to be reverted.
Decoders and decoding functions.
A decoder transforms a sequence of token ids back to a readable piece
of text.
Some normalizers and pre-tokenizers use special characters or
identifiers that need special logic to be reverted.
"""

defstruct [:resource]

@type t() :: %__MODULE__{resource: reference()}

@doc """
@@ -13,113 +19,104 @@ defmodule Tokenizers.Decoder do
@spec decode(t(), [String.t()]) :: {:ok, String.t()} | {:error, any()}
defdelegate decode(decoder, tokens), to: Tokenizers.Native, as: :decoders_decode

@typedoc """
Options for BPE decoder initialization. All options can be omitted.
@doc """
Creates a BPE decoder.
* `suffix` - The suffix to add to the end of each word, defaults to `</w>`
"""
@type bpe_options :: [suffix: String.t()]
## Options
* `suffix` - the suffix to add to the end of each word. Defaults
to `</w>`
@doc """
Creates new BPE decoder
"""
@spec bpe(bpe_options :: bpe_options()) :: t()
defdelegate bpe(options \\ []), to: Tokenizers.Native, as: :decoders_bpe
@spec bpe(keyword()) :: t()
defdelegate bpe(opts \\ []), to: Tokenizers.Native, as: :decoders_bpe

@doc """
Creates new ByteFallback decoder
Creates a ByteFallback decoder.
"""
@spec byte_fallback() :: t()
defdelegate byte_fallback(), to: Tokenizers.Native, as: :decoders_byte_fallback

@doc """
Creates new ByteLevel decoder
Creates a ByteLevel decoder.
"""
@spec byte_level() :: t()
defdelegate byte_level(), to: Tokenizers.Native, as: :decoders_byte_level

@typedoc """
Options for CTC decoder initialization. All options can be omitted.
@doc """
Creates a CTC decoder.
* `pad_token` - The token used for padding, defaults to `<pad>`
* `word_delimiter_token` - The token used for word delimiter, defaults to `|`
* `cleanup` - Whether to cleanup tokenization artifacts, defaults to `true`
"""
@type ctc_options :: [
pad_token: String.t(),
word_delimiter_token: String.t(),
cleanup: boolean()
]
## Options
* `pad_token` - the token used for padding. Defaults to `<pad>`
* `word_delimiter_token` - the token used for word delimiter.
Defaults to `|`
* `cleanup` - whether to clean up tokenization artifacts. Defaults
to `true`
@doc """
Creates new CTC decoder
"""
@spec ctc(ctc_options :: ctc_options()) :: t()
defdelegate ctc(options \\ []), to: Tokenizers.Native, as: :decoders_ctc
@spec ctc(keyword()) :: t()
defdelegate ctc(opts \\ []), to: Tokenizers.Native, as: :decoders_ctc

@doc """
Creates new Fuse decoder
Creates a Fuse decoder.
"""
@spec fuse :: t()
defdelegate fuse(), to: Tokenizers.Native, as: :decoders_fuse

@typedoc """
Options for Metaspace decoder initialization. All options can be omitted.
@doc """
Creates a Metaspace decoder.
## Options
* `replacement` - The replacement character, defaults to `▁` (as char)
* `add_prefix_space` - Whether to add a space to the first word, defaults to `true`
"""
* `replacement` - the replacement character. Defaults to `▁`
(as char)
@type metaspace_options :: [
replacement: char(),
add_prefix_space: boolean()
]
* `add_prefix_space` - whether to add a space to the first word.
Defaults to `true`
@doc """
Creates new Metaspace decoder
"""
@spec metaspace(metaspace_options :: metaspace_options()) :: t()
defdelegate metaspace(options \\ []),
@spec metaspace(keyword()) :: t()
defdelegate metaspace(opts \\ []),
to: Tokenizers.Native,
as: :decoders_metaspace

@doc """
Creates new Replace decoder
Creates a Replace decoder.
"""
@spec replace(pattern :: String.t(), content :: String.t()) :: t()
@spec replace(String.t(), String.t()) :: t()
defdelegate replace(pattern, content), to: Tokenizers.Native, as: :decoders_replace

@doc """
Creates new Sequence decoder
Combines a list of decoders into a single sequential decoder.
"""
@spec sequence(decoders :: [t()]) :: t()
defdelegate sequence(decoders), to: Tokenizers.Native, as: :decoders_sequence

@doc """
Creates new Strip decoder.
Creates a Strip decoder.
It expects a character and the number of times to strip the
character on `left` and `right` sides.
"""
@spec strip(content :: char(), left :: non_neg_integer(), right :: non_neg_integer()) :: t()
@spec strip(char(), non_neg_integer(), non_neg_integer()) :: t()
defdelegate strip(content, left, right), to: Tokenizers.Native, as: :decoders_strip

@typedoc """
Options for WordPiece decoder initialization. All options can be omitted.
@doc """
Creates a WordPiece decoder.
* `prefix` - The prefix to use for subwords, defaults to `##`
* `cleanup` - Whether to cleanup tokenization artifacts, defaults to `true`
"""
@type word_piece_options :: [
prefix: String.t(),
cleanup: boolean()
]
## Options
* `prefix` - the prefix to use for subwords. Defaults to `##`
* `cleanup` - whether to clean up tokenization artifacts. Defaults
to `true`
@doc """
Creates new WordPiece decoder
"""
@spec word_piece(word_piece_options :: word_piece_options()) :: t()
defdelegate word_piece(options \\ []),
@spec word_piece(keyword()) :: t()
defdelegate word_piece(opts \\ []),
to: Tokenizers.Native,
as: :decoders_wordpiece
end
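Putting a couple of the constructors above to work, a minimal sketch; the exact output string depends on the decoder's cleanup behaviour.

```elixir
# WordPiece-style decoding: the "##" prefix marks word continuations
decoder = Tokenizers.Decoder.word_piece(prefix: "##")
{:ok, text} = Tokenizers.Decoder.decode(decoder, ["Hel", "##lo", "there", "!"])
text
#=> "Hello there!"

# Decoders can also be chained into a single sequential decoder
chained = Tokenizers.Decoder.sequence([Tokenizers.Decoder.byte_fallback(), Tokenizers.Decoder.fuse()])
```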