Skip to content

Commit

Permalink
Support UniDic format for MeCabTokenizer (#132)
Browse files Browse the repository at this point in the history
* add parsing method for unidic

* remove test for parse_feature_for_unidic

* modify condition of dictionary format inference

* Revert "remove test for parse_feature_for_unidic"

This reverts commit 4986db7.
  • Loading branch information
altescy authored May 23, 2021
1 parent 9e87f78 commit d1c1787
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
41 changes: 41 additions & 0 deletions konoha/word_tokenizers/mecab_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,43 @@ def parse_feature_for_ipadic(elem) -> Token:
)


def parse_feature_for_unidic(elem) -> Token:
    """Parse one line of MeCab output in UniDic format into a ``Token``.

    UniDic format: https://unidic.ninjal.ac.jp/faq

    Args:
        elem: A single MeCab output line, ``"<surface>\\t<csv features>"``.

    Returns:
        A ``Token`` populated from the UniDic feature columns. Empty feature
        columns are normalized to ``None``.
    """
    # Fixed typo: was `feaature_line`.
    surface, feature_line = elem.split("\t")
    features = feature_line.split(",")

    # UniDic columns 0-5: pos1..pos4, conjugation type, conjugation form.
    postag = features[0] or None
    postag2 = features[1] or None
    postag3 = features[2] or None
    postag4 = features[3] or None
    inflection = features[4] or None
    conjugation = features[5] or None

    # Unknown words emit only the first 6 columns; the lexical columns
    # (lForm=6, lemma=7, pron=9) are present only for dictionary entries.
    if len(features) >= 10:
        yomi = features[6] or None
        base_form = features[7] or None
        pron = features[9] or None
    else:
        yomi = None
        base_form = None
        pron = None

    return Token(
        surface=surface,
        postag=postag,
        postag2=postag2,
        postag3=postag3,
        postag4=postag4,
        inflection=inflection,
        conjugation=conjugation,
        base_form=base_form,
        yomi=yomi,
        pron=pron,
    )


class MeCabTokenizer(BaseTokenizer):
"""Wrapper class forexternal text analyzers"""

Expand Down Expand Up @@ -84,12 +121,16 @@ def __init__(
if dictionary_format is None:
if system_dictionary_path is None or "ipadic" in system_dictionary_path.lower():
self._parse_feature = parse_feature_for_ipadic
elif "unidic" in system_dictionary_path.lower():
self._parse_feature = parse_feature_for_unidic
else:
raise ValueError(f"Unsupported system dictionary: {system_dictionary_path}")

else:
if "ipadic" == dictionary_format.lower():
self._parse_feature = parse_feature_for_ipadic
elif "unidic" == dictionary_format.lower():
self._parse_feature = parse_feature_for_unidic
else:
raise ValueError(f"Unsupported dictionary format: {dictionary_format}")

Expand Down
35 changes: 35 additions & 0 deletions tests/word_tokenizer/test_mecab_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from konoha.konoha_token import Token
from konoha.word_tokenizer import WordTokenizer
from konoha.word_tokenizers.mecab_tokenizer import parse_feature_for_unidic


def test_word_tokenize_with_mecab():
Expand Down Expand Up @@ -138,3 +139,37 @@ def test_word_tokenize_with_s3_system_dictionary():
expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
result = tokenizer.tokenize("吾輩は猫である")
assert expect == result


def test_parse_feature_for_unidic():
    """Check that a full UniDic feature line maps onto the expected Token fields."""
    line = (
        "吾輩\t代名詞,,,,,,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,"
        ",,,,,,体,ワガハイ,ワガハイ,ワガハイ,ワガハイ,0,,,11321954766299648,41189"
    )
    token = parse_feature_for_unidic(line)

    attributes = (
        "surface",
        "postag",
        "postag2",
        "postag3",
        "postag4",
        "inflection",
        "conjugation",
        "base_form",
        "normalized_form",
        "yomi",
        "pron",
    )
    result = tuple(getattr(token, name) for name in attributes)

    expect = (
        "吾輩",
        "代名詞",
        None,
        None,
        None,
        None,
        None,
        "我が輩",
        None,
        "ワガハイ",
        "ワガハイ",
    )

    assert result == expect

0 comments on commit d1c1787

Please sign in to comment.