feat: Added onnx and tokenizer files support script (#119)
* Updated save_pretrained

* Added onnx export script

* Fixed tests, fixed bug with package extras

* Moved function

* Moved function

* Updated tokenizer logic

* Update

* Updated readme

* Updated readme

* Updated readme
Pringled authored Nov 1, 2024
1 parent 6022837 commit 54a6460
Showing 4 changed files with 348 additions and 0 deletions.
84 changes: 84 additions & 0 deletions README.md
@@ -54,6 +54,7 @@ Model2Vec is a technique to turn any sentence transformer into a really small st
- [Distillation](#distillation)
- [Inference](#inference)
- [Evaluation](#evaluation)
- [Integrations](#integrations)
- [Model List](#model-list)
- [Results](#results)
- [Related Work](#related-work)
@@ -356,6 +357,89 @@ print(make_leaderboard(task_scores))
```
</details>

### Integrations
<details>
<summary> Sentence Transformers </summary>
<br>

Model2Vec can be used directly in [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) using the `StaticEmbedding` module.

The following code snippet shows how to load a Model2Vec model into a Sentence Transformer model:
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

# Initialize a StaticEmbedding module
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
model = SentenceTransformer(modules=[static_embedding])
embeddings = model.encode(["It's dangerous to go alone!", "It's a secret to everybody."])
```
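
The resulting embeddings can be compared with the cosine-similarity helper from `sentence_transformers.util`; a minimal follow-up sketch, reusing the `embeddings` computed above:

```python
from sentence_transformers.util import cos_sim

# embeddings has shape (2, dim); cos_sim returns the full 2x2 similarity matrix
similarities = cos_sim(embeddings, embeddings)
print(similarities)
```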

The following code snippet shows how to distill a model directly into a Sentence Transformer model:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cpu", pca_dims=256)
model = SentenceTransformer(modules=[static_embedding])
embeddings = model.encode(["It's dangerous to go alone!", "It's a secret to everybody."])
```
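
Because the distilled model is a regular Sentence Transformer, it can be saved and reloaded like any other model; a minimal sketch, where `potion-distilled` is just a hypothetical output directory:

```python
# Save the distilled static model to a local directory (hypothetical path)
model.save("potion-distilled")

# Reload it later for inference
reloaded = SentenceTransformer("potion-distilled")
embeddings = reloaded.encode(["It's dangerous to go alone!"])
```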

</details>


<details>
<summary> Transformers.js </summary>

<br>

To use a Model2Vec model in [transformers.js](https://github.com/huggingface/transformers.js), the following code snippet can be used as a starting point:

```javascript
import { AutoModel, AutoTokenizer, Tensor } from '@huggingface/transformers';

const modelName = 'minishlab/potion-base-8M';

const modelConfig = {
config: { model_type: 'model2vec' },
dtype: 'fp32',
revision: 'refs/pr/1'
};
const tokenizerConfig = {
revision: 'refs/pr/2'
};

const model = await AutoModel.from_pretrained(modelName, modelConfig);
const tokenizer = await AutoTokenizer.from_pretrained(modelName, tokenizerConfig);

const texts = ['hello', 'hello world'];
const { input_ids } = await tokenizer(texts, { add_special_tokens: false, return_tensor: false });

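// The exported ONNX graph mean-pools over one flattened list of token ids, so each text
// is described by the offset at which its tokens start in that flattened array.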
const cumsum = arr => arr.reduce((acc, num, i) => [...acc, num + (acc[i - 1] || 0)], []);
const offsets = [0, ...cumsum(input_ids.slice(0, -1).map(x => x.length))];

const flattened_input_ids = input_ids.flat();
const modelInputs = {
input_ids: new Tensor('int64', flattened_input_ids, [flattened_input_ids.length]),
offsets: new Tensor('int64', offsets, [offsets.length])
};

const { embeddings } = await model(modelInputs);
console.log(embeddings.tolist()); // output matches python version
```

Note that this requires that the Model2Vec model has a `model.onnx` file and several required tokenizer files. To generate these for a model that does not have them yet, the following code snippet can be used:

```bash
python scripts/export_to_onnx.py --model_path <path-to-a-model2vec-model> --save_path "<path-to-save-the-onnx-model>"
```
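
The export script sits behind the optional `onnx` extra (`pip install model2vec[onnx]`). To sanity-check the exported files from Python, the ONNX model can be run directly with `onnxruntime`; a minimal sketch, assuming `onnxruntime` is installed and the export above was written to a hypothetical `exported/` directory:

```python
import numpy as np
import onnxruntime as ort
from tokenizers import Tokenizer

# Load the exported tokenizer and ONNX model (paths assume the hypothetical "exported/" directory)
tokenizer = Tokenizer.from_file("exported/tokenizer.json")
session = ort.InferenceSession("exported/onnx/model.onnx")

# Tokenize without special tokens, mirroring the export script
texts = ["hello", "hello world"]
token_ids = [tokenizer.encode(text, add_special_tokens=False).ids for text in texts]

# Flatten the token ids and record where each text starts, as expected by the EmbeddingBag-based graph
input_ids = np.array([tid for ids in token_ids for tid in ids], dtype=np.int64)
offsets = np.array([0] + list(np.cumsum([len(ids) for ids in token_ids])[:-1]), dtype=np.int64)

embeddings = session.run(None, {"input_ids": input_ids, "offsets": offsets})[0]
print(embeddings.shape)  # (2, embedding_dim)
```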


<br>
</details>


## Model List

We provide a number of models that can be used out of the box. These models are available on the [HuggingFace hub](https://huggingface.co/collections/minishlab/model2vec-base-models-66fd9dd9b7c3b3c0f25ca90e) and can be loaded using the `from_pretrained` method. The models are listed below.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -54,6 +54,8 @@ dev = [
]
distill = ["torch", "transformers", "scikit-learn"]

onnx = ["onnx", "torch"]

[project.urls]
"Homepage" = "https://github.com/MinishLab"
"Bug Reports" = "https://github.com/MinishLab/model2vec/issues"
208 changes: 208 additions & 0 deletions scripts/export_to_onnx.py
@@ -0,0 +1,208 @@
from model2vec.utils import get_package_extras, importable

# Define the optional dependency group name
_REQUIRED_EXTRA = "onnx"

# Check if each dependency for the "onnx" group is importable
for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
importable(extra_dependency, _REQUIRED_EXTRA)

import argparse
import json
import logging
from pathlib import Path

import torch
from tokenizers import Tokenizer
from transformers import AutoTokenizer, PreTrainedTokenizerFast

from model2vec import StaticModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TorchStaticModel(torch.nn.Module):
def __init__(self, model: StaticModel) -> None:
"""Initialize the TorchStaticModel with a StaticModel instance."""
super().__init__()
# Convert NumPy embeddings to a torch.nn.EmbeddingBag
embeddings = torch.tensor(model.embedding, dtype=torch.float32)
self.embedding_bag = torch.nn.EmbeddingBag.from_pretrained(embeddings, mode="mean", freeze=True)
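        # mode="mean" makes the EmbeddingBag average the looked-up token vectors per sequence,
        # and freeze=True keeps the pretrained embedding weights fixed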
self.normalize = model.normalize
# Save tokenizer attributes
self.tokenizer = model.tokenizer
self.unk_token_id = model.unk_token_id
self.median_token_length = model.median_token_length

def forward(self, input_ids: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the model.
:param input_ids: The input token ids.
:param offsets: The offsets to compute the mean pooling.
:return: The embeddings.
"""
# Perform embedding lookup and mean pooling
embeddings = self.embedding_bag(input_ids, offsets)
# Normalize if required
if self.normalize:
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
return embeddings

def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple[torch.Tensor, torch.Tensor]:
"""
Tokenize the input sentences.
:param sentences: The input sentences.
:param max_length: The maximum length of the input_ids.
:return: The input_ids and offsets.
"""
        # Tokenization logic mirrors that of StaticModel
if max_length is not None:
m = max_length * self.median_token_length
sentences = [sentence[:m] for sentence in sentences]
encodings = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
encodings_ids = [encoding.ids for encoding in encodings]
if self.unk_token_id is not None:
# Remove unknown tokens
encodings_ids = [
[token_id for token_id in token_ids if token_id != self.unk_token_id] for token_ids in encodings_ids
]
if max_length is not None:
encodings_ids = [token_ids[:max_length] for token_ids in encodings_ids]
# Flatten input_ids and compute offsets
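        # offsets[i] is the index in the flattened input_ids where sentence i starts,
        # matching the layout torch.nn.EmbeddingBag expects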
offsets = torch.tensor([0] + [len(ids) for ids in encodings_ids[:-1]], dtype=torch.long).cumsum(dim=0)
input_ids = torch.tensor(
[token_id for token_ids in encodings_ids for token_id in token_ids],
dtype=torch.long,
)
return input_ids, offsets


def export_model_to_onnx(model_path: str, save_path: Path) -> None:
"""
Export the StaticModel to ONNX format and save tokenizer files.
:param model_path: The path to the pretrained StaticModel.
:param save_path: The directory to save the model and related files.
"""
save_path.mkdir(parents=True, exist_ok=True)

# Load the StaticModel
model = StaticModel.from_pretrained(model_path)
torch_model = TorchStaticModel(model)

# Save the model using save_pretrained
model.save_pretrained(save_path)

# Prepare dummy input data
texts = ["hello", "hello world"]
input_ids, offsets = torch_model.tokenize(texts)

# Export the model to ONNX
onnx_model_path = save_path / "onnx/model.onnx"
onnx_model_path.parent.mkdir(parents=True, exist_ok=True)
torch.onnx.export(
torch_model,
(input_ids, offsets),
str(onnx_model_path),
export_params=True,
opset_version=14,
do_constant_folding=True,
input_names=["input_ids", "offsets"],
output_names=["embeddings"],
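        # Dynamic axes allow variable token counts and batch sizes at inference time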
dynamic_axes={
"input_ids": {0: "num_tokens"},
"offsets": {0: "batch_size"},
"embeddings": {0: "batch_size"},
},
)

logger.info(f"Model has been successfully exported to {onnx_model_path}")

# Save the tokenizer files required for transformers.js
save_tokenizer(model.tokenizer, save_path)
logger.info(f"Tokenizer files have been saved to {save_path}")


def save_tokenizer(tokenizer: Tokenizer, save_directory: Path) -> None:
"""
Save tokenizer files in a format compatible with Transformers.
:param tokenizer: The tokenizer from the StaticModel.
:param save_directory: The directory to save the tokenizer files.
:raises FileNotFoundError: If config.json is not found in save_directory.
:raises FileNotFoundError: If tokenizer_config.json is not found in save_directory.
:raises ValueError: If tokenizer_name is not found in config.json.
"""
tokenizer_json_path = save_directory / "tokenizer.json"
tokenizer.save(str(tokenizer_json_path))

# Save vocab.txt
vocab = tokenizer.get_vocab()
vocab_path = save_directory / "vocab.txt"
with open(vocab_path, "w", encoding="utf-8") as vocab_file:
for token in sorted(vocab, key=vocab.get):
vocab_file.write(f"{token}\n")

# Load config.json to get tokenizer_name
config_path = save_directory / "config.json"
if config_path.exists():
with open(config_path, "r", encoding="utf-8") as f:
config = json.load(f)
else:
raise FileNotFoundError(f"config.json not found in {save_directory}")

tokenizer_name = config.get("tokenizer_name")
if not tokenizer_name:
raise ValueError("tokenizer_name not found in config.json")

# Load the original tokenizer
original_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Extract special tokens and tokenizer class
special_tokens = original_tokenizer.special_tokens_map
tokenizer_class = original_tokenizer.__class__.__name__

# Load the tokenizer using PreTrainedTokenizerFast with special tokens
fast_tokenizer = PreTrainedTokenizerFast(
tokenizer_file=str(tokenizer_json_path),
**special_tokens,
)

# Save the tokenizer files
fast_tokenizer.save_pretrained(str(save_directory))
# Modify tokenizer_config.json to set the correct tokenizer_class
tokenizer_config_path = save_directory / "tokenizer_config.json"
if tokenizer_config_path.exists():
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
else:
raise FileNotFoundError(f"tokenizer_config.json not found in {save_directory}")

# Update the tokenizer_class field
tokenizer_config["tokenizer_class"] = tokenizer_class

# Write the updated tokenizer_config.json back to disk
with open(tokenizer_config_path, "w", encoding="utf-8") as f:
json.dump(tokenizer_config, f, indent=4, sort_keys=True)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Export StaticModel to ONNX format")
parser.add_argument(
"--model_path",
type=str,
required=True,
help="Path to the pretrained StaticModel",
)
parser.add_argument(
"--save_path",
type=str,
required=True,
help="Directory to save the exported model and files",
)
args = parser.parse_args()

export_model_to_onnx(args.model_path, Path(args.save_path))
