'''
Japanese language processing for `tubelex` and `wikipedia-word-frequency-clean`.
'''
import fugashi # type: ignore
import os
from typing import Optional
import argparse
import re
# Word matching (not just for Japanese)
def _assert_safe_for_re_range(s: str) -> None:
    '''
    Sanity checks before we insert `s` at the end of a regex range [...s].
    '''
    assert len(s) == len(set(s))
    assert ']' not in s
    assert '\\' not in s
    assert ('-' not in s) or s.endswith('-')
def get_re_word(
    allow_start_end: str = '',
    allow_end: str = ''
) -> re.Pattern:
    '''
    Match words of len>=1. No decimal digits (\\d) at any position.
    First and last character must be word-forming (\\w), i.e. alphabet, CJK, etc.
    Note: \\w includes accented chars, CJK, etc.
    \\d are decimals in many scripts, but not CJK.
    Use `allow_start_end` to allow characters other than \\w, such as hyphen,
    apostrophe (English) or wave dash (Japanese), to appear as the first or last
    character. (Note: does not work for adding digits.)
    Use `allow_end` to allow characters to appear as the last character of a word
    longer than a single character.
    Useful both for space-separated languages (segmented with regex) and languages
    requiring more complex segmentation (Chinese, Japanese).
    '''
    _assert_safe_for_re_range(allow_start_end)
    _assert_safe_for_re_range(allow_end)
    assert '-' not in allow_end
    return re.compile(
        rf'^(?!\d)[\w{allow_start_end}]'
        rf'([^\d]*[\w{allow_end}{allow_start_end}])?(?<!\d)$'
    )
def get_re_word_relaxed() -> re.Pattern:
    return re.compile(
        r'^([^\d]*(?!\d)[\w][^\d]*)$'
    )
def get_re_split(no_split: str = '') -> re.Pattern:
    '''
    Match non-word sequences to split words. Such sequences may consist of:
    - characters neither in \\w nor in `no_split`
    - characters in \\d
    For languages that can be segmented with a regex (not Chinese or Japanese).
    Also see `get_re_word()`.
    '''
    _assert_safe_for_re_range(no_split)
    # We need a non-capturing group '(?:...)' for split() to use the whole regex
    return re.compile(rf'(?:[^\w{no_split}]|\d)+')
WAVE_DASH = '\u301C' # 〜 may look like fullwidth tilde ~
EN_DASH = '\u2013' # – may look like hyphen -
# Examples (test):
_re_word = get_re_word()
_re_split = get_re_split()
assert all(_re_word.fullmatch(w) for w in ['a', '亀', 'コアラ', 'Pú', 'A/B', 'bla-bla'])
assert not any(
    _re_word.match(w) for w in ['', '1', 'a1', '1a', 'C3PIO', '/', '-', 'あ〜']
)
assert get_re_word(allow_start_end=WAVE_DASH).match('あ〜')
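# Additional example for `allow_end` (illustrative, not part of the original tests):
# the given character is admitted only as the final character of a longer word.
assert get_re_word(allow_end="'").fullmatch("students'")
assert not get_re_word(allow_end="'").fullmatch("'")
assert not _re_word.fullmatch("students'")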
assert (
    _re_split.split('a.b cč5dď-eé\'ff1+2*3.5koala') ==
    ['a', 'b', 'cč', 'dď', 'eé', 'ff', 'koala']
)
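# Examples for `get_re_word_relaxed()` (illustrative, not part of the original
# tests): any string with at least one word-forming character and no digits.
_re_word_relaxed = get_re_word_relaxed()
assert all(_re_word_relaxed.fullmatch(w) for w in ['-a-', 'あ〜', 'A/B'])
assert not any(_re_word_relaxed.fullmatch(w) for w in ['', '123', 'a1', '//'])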
NORMALIZE_FULLWIDTH_TILDE: dict[int, int] = {
    0xFF5E: 0x301C  # fullwidth tilde '~' (common typo) => wave dash '〜'
}
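# Example (illustrative): NORMALIZE_FULLWIDTH_TILDE can be passed to
# `str.translate()` to normalize the fullwidth-tilde typo before matching:
assert 'あ\uFF5E'.translate(NORMALIZE_FULLWIDTH_TILDE) == 'あ' + WAVE_DASH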
def fugashi_tagger(dicdir: Optional[str]) -> fugashi.GenericTagger:
    if dicdir is None:
        return fugashi.Tagger('-O wakati')  # -d/-r supplied automatically
    # GenericTagger: we do not supply a wrapper (not needed for -O wakati)
    mecabrc = os.path.join(dicdir, 'mecabrc')
    return fugashi.GenericTagger(f'-O wakati -d {dicdir} -r {mecabrc}')
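# Example usage of `fugashi_tagger` (illustrative sketch; assumes a dictionary
# such as the `unidic-lite` package is installed, and the exact segmentation
# depends on the dictionary):
#     tagger = fugashi_tagger(None)
#     tokens = tagger.parse('日本語のテキスト')  # space-separated surface forms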
def add_tagger_arg_group(
    parser: argparse.ArgumentParser,
    title: Optional[str] = None
):
    titled_group = parser.add_argument_group(title=title)
    dic_group = titled_group.add_mutually_exclusive_group()
    dic_group.add_argument(
        '--dicdir', type=str, default=None,
        help='Dictionary directory for fugashi/MeCab.'
    )
    dic_group.add_argument(
        '--dictionary', '-D', choices=('unidic', 'unidic-lite'), default=None,
        help='Dictionary (installed as a Python package) for fugashi/MeCab.'
    )
def tagger_from_args(args: argparse.Namespace) -> fugashi.GenericTagger:
    # We always specify dicdir EXPLICITLY
    if args.dicdir is not None:
        dicdir = args.dicdir
    else:
        if args.dictionary == 'unidic':
            import unidic  # type: ignore
            dicdir = unidic.DICDIR
        else:
            assert args.dictionary is None or args.dictionary == 'unidic-lite'
            import unidic_lite  # type: ignore
            dicdir = unidic_lite.DICDIR
    return fugashi_tagger(dicdir)
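# Example wiring (illustrative sketch; the title and option values are made up):
#     parser = argparse.ArgumentParser()
#     add_tagger_arg_group(parser, title='tokenization')
#     args = parser.parse_args(['--dictionary', 'unidic-lite'])
#     tagger = tagger_from_args(args)  # uses unidic_lite.DICDIR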