Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for regular expressions in Tokenizers.Normalizer.replace/2 #56

Merged
merged 1 commit into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Support for regular expressions in the replace normalizer. See
`Tokenizers.Normalizer.replace_regex/2`.
- Support for regular expressions in the split pre-tokenizer. See
`Tokenizers.PreTokenizer.split_regex/3`.

## [v0.4.0] - 2023-08-09

### Added
Expand Down
19 changes: 15 additions & 4 deletions lib/tokenizers/normalizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,23 @@ defmodule Tokenizers.Normalizer do
defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase

@doc """
Replaces every occurrence of the literal `search` string with the given
`content`.

The `search` string is matched verbatim; for regular-expression based
replacement use `replace_regex/2`.
"""
@spec replace(String.t(), String.t()) :: t()
def replace(search, content) do
  # The native layer selects between literal and regex matching via a
  # tagged tuple ({:string, _} here, {:regex, _} in replace_regex/2).
  Tokenizers.Native.normalizers_replace({:string, search}, content)
end

@doc """
Replaces occurrences of a custom regexp `pattern` with the given `content`.

The `pattern` should be a string representing a regular expression
according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
"""
@spec replace_regex(String.t(), String.t()) :: t()
def replace_regex(pattern, content),
  # Tagged as {:regex, _} so the native side compiles `pattern` as a regex.
  do: Tokenizers.Native.normalizers_replace({:regex, pattern}, content)

@doc """
Creates a Nmt normalizer.
Expand Down
19 changes: 16 additions & 3 deletions native/ex_tokenizers/src/normalizers.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use crate::{new_info, util::Info, ExTokenizersError};
use rustler::NifTaggedEnum;
use serde::{Deserialize, Serialize};
use tokenizers::{NormalizedString, Normalizer, NormalizerWrapper};
use tokenizers::{
normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper,
};

pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper);

Expand Down Expand Up @@ -241,13 +243,24 @@ pub fn normalizers_lowercase() -> ExTokenizersNormalizer {
ExTokenizersNormalizer::new(tokenizers::normalizers::utils::Lowercase)
}

/// NIF-side mirror of the `tokenizers` crate's `ReplacePattern`.
///
/// Decoded from Elixir tagged tuples via `NifTaggedEnum`:
/// `{:string, s}` selects a literal-string pattern, `{:regex, s}` a
/// regular-expression pattern (regex syntax is whatever the `tokenizers`
/// crate's replace normalizer uses — presumably Oniguruma, per the Elixir
/// docs in this PR; confirm against the crate).
#[derive(NifTaggedEnum)]
pub enum LocalReplacePattern {
// Pattern matched verbatim.
String(String),
// Pattern compiled as a regular expression by the native library.
Regex(String),
}

/// NIF entry point: builds a `Replace` normalizer from either a literal
/// string or a regex pattern, as selected by the Elixir-side tagged tuple.
///
/// Returns `Err(rustler::Error::BadArg)` when the underlying
/// `Replace::new` rejects the pattern (e.g. an invalid regex).
#[rustler::nif]
pub fn normalizers_replace(
    pattern: LocalReplacePattern,
    content: String,
) -> Result<ExTokenizersNormalizer, rustler::Error> {
    // Translate the NIF-decoded enum into the `tokenizers` crate's own
    // pattern type; the two enums are structurally identical.
    let final_pattern = match pattern {
        LocalReplacePattern::String(pattern) => ReplacePattern::String(pattern),
        LocalReplacePattern::Regex(pattern) => ReplacePattern::Regex(pattern),
    };

    Ok(ExTokenizersNormalizer::new(
        tokenizers::normalizers::replace::Replace::new(final_pattern, content)
            // Collapse any construction failure into BadArg for the BEAM side.
            .map_err(|_| rustler::Error::BadArg)?,
    ))
}
Expand Down
24 changes: 24 additions & 0 deletions test/tokenizers/normalizer_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,28 @@ defmodule Tokenizers.NormalizerTest do
{:ok, "▁Hello"}
end
end

# Literal-string replacement via Tokenizers.Normalizer.replace/2.
describe "Replace" do
test "can be initialized" do
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace("find", "replace")
end

test "can normalize strings" do
# The whole input "Hello" is a literal match, so the result is "World".
assert Tokenizers.Normalizer.replace("Hello", "World")
|> Tokenizers.Normalizer.normalize("Hello") ==
{:ok, "World"}
end
end

# Regex replacement via Tokenizers.Normalizer.replace_regex/2 (new in this PR).
describe "Replace Regex" do
test "can be initialized" do
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace_regex("\\d*", "")
end

test "can normalize strings" do
# \d* matches each run of digits; replacing with "" strips them,
# turning "1Hel2lo3" into "Hello".
assert Tokenizers.Normalizer.replace_regex("\\d*", "")
|> Tokenizers.Normalizer.normalize("1Hel2lo3") ==
{:ok, "Hello"}
end
end
end
Loading