diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 1466ffb0..03ef8568 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -903,7 +903,7 @@ FSTProcessor::compoundAnalysis(UString input_word) { UChar val=input_word[i]; - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() > MAX_COMBINATIONS) { @@ -1068,7 +1068,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) { rcx_map_ptr = rcx_map.find(val); std::set tmpset = rcx_map_ptr->second; - if(!u_isupper(val) || caseSensitive) + if(!u_isupper(val) || beCaseSensitive(current_state)) { current_state.step(val, tmpset); } @@ -1087,7 +1087,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) } else { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.size() != 0) @@ -1580,7 +1580,7 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode) alphabet.getSymbol(sf,val); if(current_state.size() > 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !(beCaseSensitive(current_state))) { if(mode == gm_carefulcase) { @@ -1621,7 +1621,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output) size_t cur_word = 0; size_t cur_pos = 0; size_t match_pos = 0; - current_state = initial_state; + State current_state = initial_state; UString last_match; int space_diff = 0; @@ -1712,7 +1712,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output) } } - current_state.step_case_override(sym, caseSensitive); + current_state.step_case_override(sym, beCaseSensitive(current_state)); if (current_state.size() == 0 || is_end) { if (last_match.empty()) { @@ -1866,7 +1866,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && 
!caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2019,7 +2019,7 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2277,7 +2277,7 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) } if(current_state.size() != 0) { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.isFinal(all_finals)) { @@ -2376,7 +2376,7 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2541,7 +2541,7 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2744,7 +2744,7 @@ FSTProcessor::SAO(InputFile& input, UFILE *output) last = input_buffer.getPos(); } - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() != 0) { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index c4109180..a0787996 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -59,11 +59,6 @@ class FSTProcessor */ std::map transducers; - /** - * Current state of lexical analysis - */ - State current_state; - /** * Initial state of every token */ @@ -443,6 +438,19 @@ class FSTProcessor 
bool isLastBlankTM = false;
 
   xmlTextReaderPtr reader;
+
+  static constexpr size_t max_case_insensitive_state_size = 65536;
+  /**
+   * Including lowercased versions for every character can potentially create very large states
+   * (See https://github.com/apertium/lttoolbox/issues/167 ). As a sanity-check we don't do
+   * case-insensitive matching once the state size reaches max_case_insensitive_state_size.
+   * @param state the state whose size (number of alive transductions) is checked
+   * @return true if running with --case-sensitive or state size is at least the max
+   */
+  bool beCaseSensitive(const State& state) const {
+    return caseSensitive || state.size() >= max_case_insensitive_state_size;
+  }
+
 public:
 
   /*
diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc
index 4e03d008..1ebed82f 100644
--- a/lttoolbox/state.cc
+++ b/lttoolbox/state.cc
@@ -81,7 +81,7 @@ State::copy(State const &s)
   }
 }
 
-int
+size_t
 State::size() const
 {
   return state.size();
diff --git a/lttoolbox/state.h b/lttoolbox/state.h
index d9b67199..1efef6bd 100644
--- a/lttoolbox/state.h
+++ b/lttoolbox/state.h
@@ -165,7 +165,7 @@ class State
    * Number of alive transductions
    * @return the size
    */
-  int size() const;
+  size_t size() const;
 
   /**
    * step = apply + epsilonClosure