Skip to content

Commit

Permalink
Use PunktTokenizer instead of nltk.data.load
Browse files Browse the repository at this point in the history
  • Loading branch information
vblagoje committed Aug 29, 2024
1 parent fb0abb6 commit 06399e8
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions haystack/nodes/preprocessor/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

with LazyImport("Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk'") as nltk_import:
import nltk
+ from nltk.tokenize.punkt import PunktTokenizer

iso639_to_nltk = {
"ru": "russian",
Expand Down Expand Up @@ -929,14 +930,14 @@ def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokeni

# Use a default NLTK model
elif language_name is not None:
- sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
+ sentence_tokenizer = PunktTokenizer(language_name)
else:
logger.error(
"PreProcessor couldn't find the default sentence tokenizer model for %s. "
" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
self.language,
)
- sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
+ sentence_tokenizer = PunktTokenizer() # default english model

return sentence_tokenizer

Expand Down

0 comments on commit 06399e8

Please sign in to comment.