diff --git a/konoha/word_tokenizers/mecab_tokenizer.py b/konoha/word_tokenizers/mecab_tokenizer.py
index 4d71235..b2ef3b9 100644
--- a/konoha/word_tokenizers/mecab_tokenizer.py
+++ b/konoha/word_tokenizers/mecab_tokenizer.py
@@ -29,6 +29,43 @@ def parse_feature_for_ipadic(elem) -> Token:
     )
 
 
+def parse_feature_for_unidic(elem) -> Token:
+    """
+    UniDic format: https://unidic.ninjal.ac.jp/faq
+    """
+    surface, feature_line = elem.split("\t")
+    features = feature_line.split(",")
+
+    postag = features[0] or None
+    postag2 = features[1] or None
+    postag3 = features[2] or None
+    postag4 = features[3] or None
+    inflection = features[4] or None
+    conjugation = features[5] or None
+
+    if len(features) >= 10:
+        yomi = features[6] or None
+        base_form = features[7] or None
+        pron = features[9] or None
+    else:
+        yomi = None
+        base_form = None
+        pron = None
+
+    return Token(
+        surface=surface,
+        postag=postag,
+        postag2=postag2,
+        postag3=postag3,
+        postag4=postag4,
+        inflection=inflection,
+        conjugation=conjugation,
+        base_form=base_form,
+        yomi=yomi,
+        pron=pron,
+    )
+
+
 class MeCabTokenizer(BaseTokenizer):
     """Wrapper class forexternal text analyzers"""
 
@@ -84,12 +121,16 @@ def __init__(
         if dictionary_format is None:
             if system_dictionary_path is None or "ipadic" in system_dictionary_path.lower():
                 self._parse_feature = parse_feature_for_ipadic
+            elif "unidic" in system_dictionary_path.lower():
+                self._parse_feature = parse_feature_for_unidic
             else:
                 raise ValueError(f"Unsupported system dictionary: {system_dictionary_path}")
         else:
             if "ipadic" == dictionary_format.lower():
                 self._parse_feature = parse_feature_for_ipadic
+            elif "unidic" == dictionary_format.lower():
+                self._parse_feature = parse_feature_for_unidic
             else:
                 raise ValueError(f"Unsupported dictionary format: {dictionary_format}")
diff --git a/tests/word_tokenizer/test_mecab_tokenizer.py b/tests/word_tokenizer/test_mecab_tokenizer.py
index 555f385..f28caca 100644
--- a/tests/word_tokenizer/test_mecab_tokenizer.py
+++ b/tests/word_tokenizer/test_mecab_tokenizer.py
@@ -2,6 +2,7 @@
 
 from konoha.konoha_token import Token
 from konoha.word_tokenizer import WordTokenizer
+from konoha.word_tokenizers.mecab_tokenizer import parse_feature_for_unidic
 
 
 def test_word_tokenize_with_mecab():
@@ -138,3 +139,37 @@ def test_word_tokenize_with_s3_system_dictionary():
     expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
     result = tokenizer.tokenize("吾輩は猫である")
     assert expect == result
+
+
+def test_parse_feature_for_unidic():
+    token = parse_feature_for_unidic(
+        "吾輩\t代名詞,,,,,,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,,,,,,,体,ワガハイ,ワガハイ,ワガハイ,ワガハイ,0,,,11321954766299648,41189"
+    )
+    result = (
+        token.surface,
+        token.postag,
+        token.postag2,
+        token.postag3,
+        token.postag4,
+        token.inflection,
+        token.conjugation,
+        token.base_form,
+        token.normalized_form,
+        token.yomi,
+        token.pron,
+    )
+    expect = (
+        "吾輩",
+        "代名詞",
+        None,
+        None,
+        None,
+        None,
+        None,
+        "我が輩",
+        None,
+        "ワガハイ",
+        "ワガハイ",
+    )
+
+    assert result == expect