From 62aed6712870fa69a07212982fd26564d09ec721 Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Sun, 21 Apr 2024 13:32:22 +0200 Subject: [PATCH] add support for regular expressions in Tokenizers.Normalizer.replace/2 --- CHANGELOG.md | 9 +++++++++ lib/tokenizers/normalizer.ex | 19 +++++++++++++++---- native/ex_tokenizers/src/normalizers.rs | 19 ++++++++++++++++--- test/tokenizers/normalizer_test.exs | 24 ++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cca2a0d..30a5589 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased + +### Added + +- Support for regular expressions to replace normalizer. See + `Tokenizers.Normalizer.replace_regex/2`. +- Support for regular expressions to split pre-tokenizer. See + `Tokenizers.PreTokenizer.split_regex/3`. + ## [v0.4.0] - 2023-08-09 ### Added diff --git a/lib/tokenizers/normalizer.ex b/lib/tokenizers/normalizer.ex index b309f1d..0a2356c 100644 --- a/lib/tokenizers/normalizer.ex +++ b/lib/tokenizers/normalizer.ex @@ -117,12 +117,23 @@ defmodule Tokenizers.Normalizer do defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase @doc """ - Replaces a custom string or regexp and changes it with given content. + Replaces a custom `search` string with the given `content`. """ @spec replace(String.t(), String.t()) :: t() - defdelegate replace(pattern, content), - to: Tokenizers.Native, - as: :normalizers_replace + def replace(search, content) do + Tokenizers.Native.normalizers_replace({:string, search}, content) + end + + @doc """ + Replaces occurrences of a custom regexp `pattern` with the given `content`. 
+ + The `pattern` should be a string representing a regular expression + according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma). + """ + @spec replace_regex(String.t(), String.t()) :: t() + def replace_regex(pattern, content) do + Tokenizers.Native.normalizers_replace({:regex, pattern}, content) + end @doc """ Creates a Nmt normalizer. diff --git a/native/ex_tokenizers/src/normalizers.rs b/native/ex_tokenizers/src/normalizers.rs index f50c109..307e1f5 100644 --- a/native/ex_tokenizers/src/normalizers.rs +++ b/native/ex_tokenizers/src/normalizers.rs @@ -1,7 +1,9 @@ use crate::{new_info, util::Info, ExTokenizersError}; use rustler::NifTaggedEnum; use serde::{Deserialize, Serialize}; -use tokenizers::{NormalizedString, Normalizer, NormalizerWrapper}; +use tokenizers::{ + normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper, +}; pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper); @@ -241,13 +243,24 @@ pub fn normalizers_lowercase() -> ExTokenizersNormalizer { ExTokenizersNormalizer::new(tokenizers::normalizers::utils::Lowercase) } +#[derive(NifTaggedEnum)] +pub enum LocalReplacePattern { + String(String), + Regex(String), +} + #[rustler::nif] pub fn normalizers_replace( - pattern: String, + pattern: LocalReplacePattern, content: String, ) -> Result<ExTokenizersNormalizer, rustler::Error> { + let final_pattern = match pattern { + LocalReplacePattern::String(pattern) => ReplacePattern::String(pattern), + LocalReplacePattern::Regex(pattern) => ReplacePattern::Regex(pattern), + }; + Ok(ExTokenizersNormalizer::new( - tokenizers::normalizers::replace::Replace::new(pattern, content) + tokenizers::normalizers::replace::Replace::new(final_pattern, content) .map_err(|_| rustler::Error::BadArg)?, )) } diff --git a/test/tokenizers/normalizer_test.exs b/test/tokenizers/normalizer_test.exs index 026a671..5341692 100644 --- a/test/tokenizers/normalizer_test.exs +++ b/test/tokenizers/normalizer_test.exs @@ -90,4 +90,28 @@ defmodule Tokenizers.NormalizerTest do 
{:ok, "▁Hello"} end end + + describe "Replace" do + test "can be initialized" do + assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace("find", "replace") + end + + test "can normalize strings" do + assert Tokenizers.Normalizer.replace("Hello", "World") + |> Tokenizers.Normalizer.normalize("Hello") == + {:ok, "World"} + end + end + + describe "Replace Regex" do + test "can be initialized" do + assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace_regex("\\d*", "") + end + + test "can normalize strings" do + assert Tokenizers.Normalizer.replace_regex("\\d*", "") + |> Tokenizers.Normalizer.normalize("1Hel2lo3") == + {:ok, "Hello"} + end + end end