Skip to content

Commit

Permalink
hardcoded sanity-max state size for case-insensitive matching
Browse files Browse the repository at this point in the history
currently 65536, quite high but at least within what most modern
machines can deal with

Also, delete FSTProcessor.current_state since confusingly all the
processors (except transliteration) make a local State called
current_state

Should help a bit against #167
  • Loading branch information
unhammer committed Oct 14, 2022
1 parent 007f8c7 commit 797829a
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 19 deletions.
24 changes: 12 additions & 12 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ FSTProcessor::compoundAnalysis(UString input_word)
{
UChar val=input_word[i];

current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));

if(current_state.size() > MAX_COMBINATIONS)
{
Expand Down Expand Up @@ -1068,7 +1068,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
{
rcx_map_ptr = rcx_map.find(val);
std::set<int> tmpset = rcx_map_ptr->second;
if(!u_isupper(val) || caseSensitive)
if(!u_isupper(val) || beCaseSensitive(current_state))
{
current_state.step(val, tmpset);
}
Expand All @@ -1087,7 +1087,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
}
else
{
current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));
}

if(current_state.size() != 0)
Expand Down Expand Up @@ -1580,7 +1580,7 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
alphabet.getSymbol(sf,val);
if(current_state.size() > 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !(beCaseSensitive(current_state)))
{
if(mode == gm_carefulcase)
{
Expand Down Expand Up @@ -1621,7 +1621,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
size_t cur_word = 0;
size_t cur_pos = 0;
size_t match_pos = 0;
current_state = initial_state;
State current_state = initial_state;
UString last_match;
int space_diff = 0;

Expand Down Expand Up @@ -1712,7 +1712,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
}
}

current_state.step_case_override(sym, caseSensitive);
current_state.step_case_override(sym, beCaseSensitive(current_state));

if (current_state.size() == 0 || is_end) {
if (last_match.empty()) {
Expand Down Expand Up @@ -1866,7 +1866,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2019,7 +2019,7 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2277,7 +2277,7 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
}
if(current_state.size() != 0)
{
current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));
}
if(current_state.isFinal(all_finals))
{
Expand Down Expand Up @@ -2376,7 +2376,7 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2541,7 +2541,7 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2744,7 +2744,7 @@ FSTProcessor::SAO(InputFile& input, UFILE *output)
last = input_buffer.getPos();
}

current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));

if(current_state.size() != 0)
{
Expand Down
18 changes: 13 additions & 5 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ class FSTProcessor
*/
std::map<UString, TransExe> transducers;

/**
* Current state of lexical analysis
*/
State current_state;

/**
* Initial state of every token
*/
Expand Down Expand Up @@ -443,6 +438,19 @@ class FSTProcessor
bool isLastBlankTM = false;

xmlTextReaderPtr reader;

static constexpr size_t max_case_insensitive_state_size = 65536;
/**
 * Decide whether matching against the given state should be case-sensitive.
 *
 * Including lowercased versions for every character can potentially create
 * very large states (see https://github.com/apertium/lttoolbox/issues/167 ).
 * As a sanity check, case-insensitive matching is not done once the state
 * size reaches max_case_insensitive_state_size.
 *
 * @param state the state whose size is compared against the limit
 * @return true if caseSensitive is set (running with --case-sensitive), or
 *         if state.size() is at least max_case_insensitive_state_size
 */
bool beCaseSensitive(const State& state) {
  // An explicit request for case-sensitivity wins; the state size is not consulted.
  if(caseSensitive)
  {
    return true;
  }
  // Otherwise fall back to case-sensitive matching only for oversized states.
  return state.size() >= max_case_insensitive_state_size;
}

public:

/*
Expand Down
2 changes: 1 addition & 1 deletion lttoolbox/state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ State::copy(State const &s)
}
}

int
size_t
State::size() const
{
return state.size();
Expand Down
2 changes: 1 addition & 1 deletion lttoolbox/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class State
* Number of alive transductions
* @return the size
*/
int size() const;
size_t size() const;

/**
* step = apply + epsilonClosure
Expand Down

0 comments on commit 797829a

Please sign in to comment.