I18n of lttoolbox #176

Open · wants to merge 16 commits into main
4 changes: 2 additions & 2 deletions .github/workflows/build.yaml
@@ -17,7 +17,7 @@ jobs:
run: ./configure --disable-static --enable-python-bindings
- name: build
run: make -j4 V=1 VERBOSE=1
- name: tests
run: make test
- name: make install
run: sudo make install
- name: tests
run: make test
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,6 +1,7 @@
*.pyc
**/*.deps/
/build/
/.vscode/
/.ccls-cache/
/*.pc
/aclocal.m4
@@ -79,13 +80,18 @@
/lttoolbox/lt-invert
/lttoolbox/lt-restrict
/lttoolbox/lt-apply-acx
/lttoolbox/formatmsg
/python/Makefile
/python/Makefile.in
/python/lttoolbox.i
/python/lttoolbox_wrap.cpp
/python/lttoolbox.py
/python/setup.py
/python/build*
/locales/Makefile
/locales/Makefile.in
*.res
*.dat
*.egg-info/
*.egg
**/.mypy_cache/
5 changes: 4 additions & 1 deletion Makefile.am
@@ -1,6 +1,6 @@
ACLOCAL_AMFLAGS=-I m4

SUBDIRS = $(PACKAGE_NAME)
SUBDIRS = $(PACKAGE_NAME) locales
DIST_SUBDIRS = $(PACKAGE_NAME)

if HAVE_PYTHON_BINDINGS
@@ -15,3 +15,6 @@ EXTRA_DIST=autogen.sh tests
# TODO: the below will use python3 if you run it on Arch Linux with no python2 installed
test: tests/run_tests.py
$(PYTHON) $<

export LOCALES_DIR=$(datadir)/$(PACKAGE_NAME)
export ALT_I18N_DATA=$(LOCALES_DIR)/lttoolbox.dat
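
These exports hand the installed bundle location down to the per-directory Makefiles; lttoolbox/Makefile.am below turns ALT_I18N_DATA into a compile-time string macro. A rough illustration of what that gives C++ code, assuming the default /usr/local prefix (the concrete path is not stated in the patch):

    // Illustration only: with prefix=/usr/local the exports above resolve to
    //   LOCALES_DIR   = /usr/local/share/lttoolbox
    //   ALT_I18N_DATA = /usr/local/share/lttoolbox/lttoolbox.dat
    // and the AM_CPPFLAGS define added below makes the latter available as a
    // string literal inside the binaries:
    #include <iostream>

    int main() {
    #ifdef ALT_I18N_DATA
      std::cout << "expecting ICU bundle at: " << ALT_I18N_DATA << '\n';
    #else
      std::cout << "ALT_I18N_DATA was not defined at compile time\n";
    #endif
    }
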
2 changes: 1 addition & 1 deletion configure.ac
@@ -105,4 +105,4 @@ then
PYTHON_INSTALL_PARAMS="--prefix=\$(prefix) --root=\$(DESTDIR)/"
fi

AC_OUTPUT([Makefile lttoolbox.pc lttoolbox/Makefile python/Makefile])
AC_OUTPUT([Makefile lttoolbox.pc lttoolbox/Makefile python/Makefile locales/Makefile])
7 changes: 7 additions & 0 deletions locales/Makefile.am
@@ -0,0 +1,7 @@
lttoolbox.dat: root.txt en.txt es.txt
genrb -d . root.txt en.txt es.txt
echo root.res en.res es.res > package_list.txt
pkgdata -p lttoolbox --mode archive -d . package_list.txt

localesdir = $(LOCALES_DIR)
dist_locales_DATA = lttoolbox.dat
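
The rule above uses ICU's resource tooling: genrb compiles each .txt source into a binary .res bundle, and pkgdata archives the listed bundles into a single lttoolbox.dat package that gets installed into $(LOCALES_DIR). A minimal sketch of reading one key back out of the installed archive with ICU4C; the install directory and locale are illustrative, and it assumes ICU can locate the package through its data directory:

    #include <unicode/putil.h>    // u_setDataDirectory
    #include <unicode/resbund.h>
    #include <unicode/unistr.h>
    #include <unicode/ustream.h>
    #include <iostream>

    int main() {
      UErrorCode status = U_ZERO_ERROR;
      // Illustrative install directory; the real one is $(LOCALES_DIR).
      u_setDataDirectory("/usr/local/share/lttoolbox");
      // Open the "lttoolbox" package (lttoolbox.dat) for the Spanish locale.
      icu::ResourceBundle bundle("lttoolbox", icu::Locale("es"), status);
      if (U_FAILURE(status)) return 1;
      icu::UnicodeString usage = bundle.getStringEx("usage", status);
      if (U_SUCCESS(status)) std::cout << usage << std::endl;  // "USO: " from es.txt
    }
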
2 changes: 2 additions & 0 deletions locales/en.txt
@@ -0,0 +1,2 @@
en{
}
31 changes: 31 additions & 0 deletions locales/es.txt
@@ -0,0 +1,31 @@
es{
lt_proc_desc{"procesar una corriente con un transductor de letras"}
analysis_desc{"análisis morfológico (comportamiento predeterminado)"}
bilingual_desc{"transferencia léxica"}
case_sensitive_desc{"utilizar el caso literal de los caracteres entrantes"}
debugged_gen_desc{"transformarse. generación con todas las cosas"}
decompose_nouns_desc{"Intenta descomponer palabras desconocidas."}
generation_desc{"generación morfológica"}
ignored_chars_desc{"especificar archivo con caracteres para ignorar"}
restore_chars_desc{"especificar archivo con caracteres para restauración diacrítica"}
tagged_gen_desc{"generación morfológica manteniendo formas léxicas"}
tagged_nm_gen_desc{"igual que -l pero sin marcas denominativas desconocidas"}
non_marked_gen_desc{"transformarse. generación sin marcas denominativas desconocidas"}
surf_bilingual_desc{"transferencia léxica con formas superficiales"}
post_generation_desc{"posgeneración"}
inter_generation_desc{"intergeneracional"}
sao_desc{"Procesamiento de entrada del sistema de anotaciones SAO"}
transliteration_desc{"aplicar diccionario de transliteración"}
version_desc{"versión"}
null_flush_desc{"salida de vaciado en el carácter nulo"}
dictionary_case_desc{"usar mayúsculas y minúsculas del diccionario en lugar de superficie"}
careful_case_desc{"usar mayúsculas y minúsculas del diccionario si están presentes; de lo contrario, superficie"}
no_default_ignore_desc{"omite la carga de los caracteres ignorados predeterminados"}
show_weights_desc{"Imprimir los pesos del análisis final (si corresponde)"}
analyses_desc{"No generar más de N análisis (si el transductor está ponderado, los N mejores análisis)"}
weight_classes_desc{"No generar más de N mejores clases de ponderación (donde los análisis con igual ponderación constituyen una clase)"}
compound_max_elements_desc{"Establecer elementos máximos compuestos"}
help_desc{"muestra esta ayuda"}
usage{"USO: "}
version{" versión "}
}
1 change: 1 addition & 0 deletions locales/package_list.txt
@@ -0,0 +1 @@
root.res en.res es.res
158 changes: 158 additions & 0 deletions locales/root.txt
@@ -0,0 +1,158 @@
root{
lt_proc_desc{"process a stream with a letter transducer"}
analysis_desc{"morphological analysis (default behavior)"}
bilingual_desc{"lexical transfer"}
case_sensitive_desc{"use the literal case of the incoming characters"}
debugged_gen_desc{"morph. generation with all the stuff"}
decompose_nouns_desc{"Try to decompound unknown words"}
generation_desc{"morphological generation"}
ignored_chars_desc{"specify file with characters to ignore"}
restore_chars_desc{"specify file with characters for diacritic restoration"}
tagged_gen_desc{"morphological generation keeping lexical forms"}
tagged_nm_gen_desc{"same as -l but without unknown word marks"}
non_marked_gen_desc{"morph. generation without unknown word marks"}
surf_bilingual_desc{"lexical transfer with surface forms"}
post_generation_desc{"post-generation"}
inter_generation_desc{"inter-generation"}
sao_desc{"SAO annotation system input processing"}
transliteration_desc{"apply transliteration dictionary"}
version_desc{"version"}
null_flush_desc{"flush output on the null character"}
dictionary_case_desc{"use dictionary case instead of surface"}
careful_case_desc{"use dictionary case if present, else surface"}
no_default_ignore_desc{"skips loading the default ignore characters"}
show_weights_desc{"Print final analysis weights (if any)"}
analyses_desc{"Output no more than N analyses (if the transducer is weighted, the N best analyses)"}
weight_classes_desc{"Output no more than N best weight classes (where analyses with equal weight constitute a class)"}
compound_max_elements_desc{"Set compound max elements"}
help_desc{"show this help"}
usage{"USAGE: "}
version{" version "}
modes{"Modes:"}
options{"Options:"}


lt_append_desc{"add sections to a compiled transducer"}
keep_desc{"in case of section name conflicts, keep the one from the first transducer"}
single_desc{"treat input transducers as one-sided"}

lt_apply_acx_desc{"apply an ACX file to a compiled transducer"}

lt_comp_desc{"build a letter transducer from a dictionary"}
debug_desc{"insert line numbers before each entry"}
keep_boundaries_desc{"keep morpheme boundaries"}
var_desc{"set language variant"}
alt_desc{"set alternative (monodix)"}
var_left_desc{"set left language variant (bidix)"}
var_right_desc{"set right language variant (bidix)"}
expect_hfst_desc{"expect HFST symbols"}
no_split_desc{"don't attempt to split into word and punctuation sections"}
jobs_desc{"use one cpu core per section when minimising, new section after 50k entries"}
verbose_desc{"compile verbosely"}

lt_compose_desc{"compose transducer1 with transducer2"}
inverted_desc{"run composition right-to-left on transducer1"}
anywhere_desc{"don't require anchored matches, let transducer2 optionally compose at any sub-path"}

lt_expand{"expand the contents of a dictionary file"}
lt_invert_desc{"reverse the direction of a compiled transducer"}

lt_paradigm_desc{"generate listings from a compiled transducer"}
analyser_desc{"FST is an analyser (tags on the right)"}
exclude_desc{"disregard paths containing TAG"}
sort_desc{"alphabetize the paths for each pattern"}

lt_print_desc{"dump a transducer to text in ATT format"}
alpha_desc{"print transducer alphabet"}
use_hfst_desc{"use HFST-compatible character escapes"}

lt_restrict_desc{"remove paths from a transducer"}
minimise_desc{"minimise transducers after deleting paths"}

lt_tmxcomp_desc{"build a letter transducer from a TMX translation memory"}
origin_code_desc{"the language code to be taken as lang1"}
meta_code_desc{"the language code to be taken as lang2"}
input_language{"input language"}
output_language{"output language"}

lt_tmxproc_desc{"process a stream with a letter transducer"}

lt_trim_desc{"trim a transducer to another transducer"}
match_section_desc{"A section with this name (id@type) will only be trimmed against a section with the same name. This argument may be used multiple times."}

ALT80000{"ERROR ALT80000: Invalid or no argument for {option}"}
ALT80010{"ERROR ALT80010: In {node_doc_url} on line {line_number}: Missing value attribute."}
ALT80020{"ERROR ALT80020: In {node_doc_url} on line {line_number}: Expected a single character in value attribute, but found {value_size}."}
ALT80030{"ERROR ALT80030: In {node_doc_url} on line {line_number}: Expected <{expected}> but found <{found}>."}
ALT80050{"ERROR ALT80050: Unable to access \"{file_name}\"."}
ALT80060{"ERROR ALT80060: Invalid format in file \"{file_name}\" on line {line_number}."}
ALT60070{"WARNING ALT60070: Multiple fsts in \"{file_name}\" will be disjuncted."}
ALT80080{"ERROR ALT80080: Transducer contains epsilon transition to a final state. Aborting."}
ALT80090{"ERROR ALT80090: Transducer contains initial epsilon loop. Aborting."}
ALT80100{"ERROR ALT80100: Cannot create empty buffer."}
ALT80110{"ERROR ALT80110: Parse error at the end of input."}
ALT80120{"ERROR ALT80120: Invalid dictionary (hint: the right side of an entry is empty)."}
ALT80121{"ERROR ALT80121: Invalid dictionary (hint: entry on the right beginning with whitespace)."}
ALT80122{"ERROR ALT80122: Invalid dictionary (hint: the left side of an entry is empty)."}
ALT80123{"ERROR ALT80123: Invalid dictionary (hint: entry on the left beginning with whitespace)."}
ALT80124{"ERROR ALT80124: Invalid dictionary (hint: entry beginning with whitespace)."}
ALT80140{"ERROR ALT80140: In file \"{file_name}\" on line {line_number}: Missing alphabet symbols."}
ALT60150{"WARNING ALT60150: Cannot insert <t/> from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)"}
ALT80160{"ERROR ALT80160: In file \"{file_name}\" on line {line_number}: Non-empty element \"<{name}>\" should be empty."}
ALT80170{"ERROR ALT80170: In file \"{file_name}\" on line {line_number}: Undefined symbol \"{symbol}\"."}
ALT80180{"ERROR ALT80180: In file \"{file_name}\" on line {line_number}: Invalid specification of element \"<{name}>\" in this context."}
ALT80190{"ERROR ALT80190: In file \"{file_name}\" on line {line_number}: Invalid construction."}
ALT80200{"ERROR ALT80200: In file \"{file_name}\" on line {line_number}: Expected \"<{slash_element}>\"."}
ALT60210{"WARNING ALT60210: In file \"{file_name}\" on line {line_number}: Entry begins with space."}
ALT80220{"ERROR ALT80220: In file \"{file_name}\" on line {line_number}: Paradigm refers to itself \"{paradigm_name}\"."}
ALT80230{"ERROR ALT80230: In file \"{file_name}\" on line {line_number}: Undefined paradigm \"{paradigm_name}\"."}
ALT80240{"ERROR ALT80240: In file \"{file_name}\" on line {line_number}: Invalid entry token."}
ALT80250{"ERROR ALT80250: In file \"{file_name}\" on line {line_number}: \"<{element_name}>\" element must specify non-void \"{attr_name}\" attribute."}
ALT80260{"ERROR ALT80260: In file \"{file_name}\" on line {line_number}: Parse error."}
ALT80270{"ERROR ALT80270: In file \"{file_name}\" on line {line_number}: Invalid inclusion of \"<{element_name}>\" into \"<{compiler_entry_element}>\"."}
ALT80280{"ERROR ALT80280: In file \"{file_name}\" on line {line_number}: Invalid node \"<{element_name}>\"."}
ALT80290{"ERROR ALT80290: I/O Error."}
ALT80300{"ERROR ALT80300: Out of range: {value}."}
ALT60320{"WARNING ALT60320: Matching case-sensitively since processor state size >= {max_case_insensitive_state_size}"}
ALT80330{"ERROR ALT80330: Unsupported transducer type for \"{transducer_first}\"."}
ALT60340{"WARNING ALT60340: CompoundAnalysis's MAX_COMBINATIONS exceeded for \"{input_word}\"\n"
" gave up at char {index} \"{char}\"."}
ALT60350{"WARNING ALT60350: Decomposition symbol {symbol} not found."}
ALT80360{"ERROR ALT80360: Unable to rewind file."}
ALT80370{"ERROR ALT80370: Unexpected trailing backslash."}
ALT60380{"WARNING ALT60380: section \"{section}\" appears in both transducers and will be overwritten!"}
ALT80390{"ERROR ALT80390: -l specified, but mode is lr."}
ALT80391{"ERROR ALT80391: -r specified, but mode is rl."}
ALT60410{"WARNING ALT60410: section {section_name} is empty! Skipping it..."}
ALT60420{"WARNING ALT60420: section {section_name} had no final state after composing! Skipping it..."}
ALT80430{"ERROR ALT80430: Composition gave empty transducer!"}
ALT60440{"WARNING ALT60440: unsupported locale, fallback to \"C\""}
ALT60450{"WARNING ALT60450: section {section_name} was not found in both transducers! Skipping if in just one..."}
ALT80460{"ERROR ALT80460: Trimming gave empty transducer!\n"
"Hint: There are no words in bilingual dictionary that match words in both monolingual dictionaries?"}
ALT80470{"ERROR ALT80470: Opening an unended sequence."}
ALT80471{"ERROR ALT80471: Ending an unopened sequence."}
ALT80490{"ERROR ALT80490: Using labels outside of a sequence."}
ALT80500{"ERROR ALT80500: Parsing regexp."}
ALT80510{"ERROR ALT80510: Unable to lowercase string \"{string}\".\n"
"Error code: {error_name}"}
ALT80511{"ERROR ALT80511: Unable to uppercase string \"{string}\".\n"
"Error code: {error_name}"}
ALT80512{"ERROR ALT80512: Unable to titlecase string \"{string}\".\n"
"Error code: {error_name}"}
ALT80513{"ERROR ALT80513: Caseless string comparison failed on \"{string_a}\" and \"{string_b}\".\n"
"Error code: {error_name}"}
ALT80550{"ERROR ALT80550: Trying to link nonexistent states ({source}, {target}, {tag})."}
ALT80560{"ERROR ALT80560: Empty set of final states."}
ALT80570{"ERROR ALT80570: Couldn't find {f_src}, {g_src} in state map."}
ALT80580{"ERROR ALT80580: Failed to read/write uint64_t."}
ALT80590{"ERROR ALT80590: Transducer has features that are unknown to this version of lttoolbox - upgrade!"}
ALT80600{"ERROR ALT80600: Unable to parse {type}."}
ALT80610{"ERROR ALT80610: Malformed input stream."}
ALT80620{"ERROR ALT80620: FST has features that are unknown to this version of lttoolbox - upgrade!"}
ALT80630{"ERROR ALT80630: Could not read {number} expected bytes from stream."}
ALT80640{"ERROR ALT80640: Can't deserialise {size} byte integer type: Can't deserialise size."}
ALT80650{"ERROR ALT80650: Can't deserialise {size} byte integer type: Can't deserialise byte."}
ALT80660{"ERROR ALT80660: Can't serialise const {size_a} byte integer type: Can't serialise size {size_b}."}
ALT80670{"ERROR ALT80670: Can't serialise const {size} byte integer type: Can't serialise byte {byte}."}
}
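
The ALT-prefixed keys above pair each diagnostic code with a message whose placeholders are named ({file_name}, {line_number}, and so on), which matches ICU MessageFormat's named-argument syntax. A short sketch of filling one of these patterns with ICU4C's MessageFormat; the key and argument names come from the bundle above, while the hard-coded pattern and sample values are only for illustration:

    #include <unicode/msgfmt.h>
    #include <unicode/unistr.h>
    #include <unicode/ustream.h>
    #include <iostream>

    int main() {
      UErrorCode status = U_ZERO_ERROR;
      // Pattern as stored under ALT80030 in root.txt above.
      icu::MessageFormat fmt(
        u"ERROR ALT80030: In {node_doc_url} on line {line_number}: "
        u"Expected <{expected}> but found <{found}>.", status);

      icu::UnicodeString names[] = {"node_doc_url", "line_number", "expected", "found"};
      icu::Formattable values[] = {"dict.xml", (int32_t)42, "char", "echar"};

      icu::UnicodeString out;
      fmt.format(names, values, 4, out, status);
      if (U_SUCCESS(status)) std::cout << out << std::endl;
    }
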
6 changes: 4 additions & 2 deletions lttoolbox/Makefile.am
@@ -4,7 +4,7 @@ h_sources = acx.h alphabet.h att_compiler.h buffer.h cli.h compiler.h compressio
match_exe.h match_node.h match_state.h my_stdio.h node.h \
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h symbol_iter.h \
transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
ustring.h sorted_vector.hpp
ustring.h sorted_vector.hpp i18n.h
cc_sources = acx.cc alphabet.cc att_compiler.cc cli.cc compiler.cc compression.cc entry_token.cc \
expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
match_node.cc match_state.cc node.cc pattern_list.cc \
@@ -14,7 +14,7 @@ cc_sources = acx.cc alphabet.cc att_compiler.cc cli.cc compiler.cc compression.c
library_includedir = $(includedir)/$(PACKAGE_NAME)
library_include_HEADERS = $(h_sources)

bin_PROGRAMS = lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-print lt-trim lt-compose lt-append lsx-comp lt-invert lt-restrict lt-apply-acx
bin_PROGRAMS = lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-print lt-trim lt-compose lt-append lsx-comp lt-invert lt-restrict lt-apply-acx formatmsg
instdir = lttoolbox

lib_LTLIBRARIES= liblttoolbox.la
@@ -29,6 +29,7 @@ lttoolboxlib = $(prefix)/lib
lttoolbox_DATA = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd

LDADD = liblttoolbox.la $(PTHREAD_LIBS)
AM_CPPFLAGS = -DALT_I18N_DATA='"$(ALT_I18N_DATA)"'
AM_LDFLAGS = -llttoolbox $(LIBXML_LIBS) $(ICU_LIBS)

lt_append_SOURCES = lt_append.cc
@@ -45,6 +46,7 @@ lsx_comp_SOURCES = lt_comp.cc
lt_invert_SOURCES = lt_invert.cc
lt_restrict_SOURCES = lt_restrict.cc
lt_apply_acx_SOURCES = lt_apply_acx.cc
formatmsg_SOURCES = formatmsg.cc

#lt-validate-dictionary: Makefile.am validate-header.sh
# @echo "Creating lt-validate-dictionary script"
19 changes: 13 additions & 6 deletions lttoolbox/acx.cc
@@ -16,40 +16,47 @@
*/
#include <lttoolbox/acx.h>
#include <lttoolbox/xml_walk_util.h>
#include <iostream>
#include <unicode/ustream.h>
#include <lttoolbox/i18n.h>

const xmlChar* CHAR_NODE = (const xmlChar*)"char";
const xmlChar* EQUIV_NODE = (const xmlChar*)"equiv-char";
const char* VALUE_ATTR = "value";

int32_t get_val(xmlNode* node)
{
I18n i18n {ALT_I18N_DATA, "lttoolbox"};
UString s = getattr(node, VALUE_ATTR);
if (s.empty()) {
error_and_die(node, "Missing value attribute.");
i18n.error("ALT80010", {"node_doc_url", "line_number"},
{(char*)node->doc->URL, node->line}, true);
}
std::vector<int32_t> v;
ustring_to_vec32(s, v);
if (v.size() > 1) {
error_and_die(node, "Expected a single character in value attribute, but found %d.", v.size());
i18n.error("ALT80020", {"node_doc_url", "line_number", "value_size"},
{(char*)node->doc->URL, node->line, std::to_string(v.size()).c_str()}, true);
}
return v[0];
}

std::map<int32_t, sorted_vector<int32_t>> readACX(const char* file)
{
I18n i18n {ALT_I18N_DATA, "lttoolbox"};
std::map<int32_t, sorted_vector<int32_t>> acx;
xmlNode* top_node = load_xml(file);
for (auto char_node : children(top_node)) {
if (!xmlStrEqual(char_node->name, CHAR_NODE)) {
error_and_die(char_node, "Expected <char> but found <%s>.",
(const char*)char_node->name);
i18n.error("ALT80030", {"node_doc_url", "line_number", "expected", "found"},
{(char*)char_node->doc->URL, char_node->line, "char", (const char*)char_node->name}, true);
}
int32_t key = get_val(char_node);
sorted_vector<int32_t> vec;
for (auto equiv_node : children(char_node)) {
if (!xmlStrEqual(equiv_node->name, EQUIV_NODE)) {
error_and_die(char_node, "Expected <equiv-char> but found <%s>.",
(const char*)equiv_node->name);
i18n.error("ALT80030", {"node_doc_url", "line_number", "expected", "found"},
{(char*)char_node->doc->URL, char_node->line, "equiv-char", (const char*)equiv_node->name}, true);
}
vec.insert(get_val(equiv_node));
}
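
Taken together, the new call sites sketch the surface of the I18n helper this PR introduces: it is constructed from the bundle path and a package name, and error() takes a resource key, a list of argument names, a matching list of values, and a flag saying whether to abort. The header lttoolbox/i18n.h itself is not shown in this excerpt; the following is only a rough reconstruction of what such a wrapper might look like, inferred from those call sites:

    // Hypothetical reconstruction of an I18n wrapper matching the call sites
    // above; the PR's actual lttoolbox/i18n.h may differ.
    #include <unicode/resbund.h>
    #include <unicode/msgfmt.h>
    #include <unicode/unistr.h>
    #include <unicode/ustream.h>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    class I18n {
    public:
      // `package` is unused in this sketch; real code might combine it with
      // the data path when locating the archive (an assumption).
      I18n(const char* data_path, const char* /*package*/)
        : bundle_(data_path, icu::Locale::getDefault(), status_) {}

      // Look up `key`, fill its {named} placeholders with `values`,
      // print the result to stderr and optionally terminate.
      void error(const char* key,
                 std::vector<icu::UnicodeString> names,
                 std::vector<icu::Formattable> values,
                 bool fatal) {
        UErrorCode status = U_ZERO_ERROR;
        icu::UnicodeString pattern = bundle_.getStringEx(key, status);
        icu::UnicodeString out = pattern;  // if formatting fails, print the raw pattern
        if (U_SUCCESS(status)) {
          icu::MessageFormat fmt(pattern, status);
          icu::UnicodeString formatted;
          fmt.format(names.data(), values.data(),
                     static_cast<int32_t>(names.size()), formatted, status);
          if (U_SUCCESS(status)) out = formatted;
        }
        std::cerr << out << std::endl;
        if (fatal) std::exit(EXIT_FAILURE);
      }

    private:
      UErrorCode status_ = U_ZERO_ERROR;
      icu::ResourceBundle bundle_;
    };
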