From 3a4466f6249436cbaebb25562ee8c0dbbb5fa725 Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Mon, 22 Jul 2024 06:46:48 +0200 Subject: [PATCH 1/4] Enclose white spaces in references Since version 0.22 gettext encloses file names in references which contain white spaces or tabs within First Strong Isolate (U+2068) and Pop Directional Isolate (U+2069). This commit adds the same behavior for Babel. --- babel/messages/pofile.py | 5 +++++ tests/messages/test_pofile.py | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 89a924255..2a9740901 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -626,6 +626,11 @@ def _format_message(message, prefix=''): for filename, lineno in locations: location = filename.replace(os.sep, '/') + if " " in location or "\t" in location: + if not location.startswith("\u2068"): + location = "\u2068" + location + if not location.endswith("\u2069"): + location += "\u2069" if lineno and include_lineno: location = f"{location}:{lineno:d}" if location not in locs: diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index 99958b7b7..ccfa7ee34 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -841,6 +841,46 @@ def test_no_include_lineno(self): msgid "foo" msgstr ""''' + def test_white_space_in_location(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('utils b.py', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_white_space_in_location_already_enclosed(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('\u2068utils b.py\u2069', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_tab_in_location(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('utils\tb.py', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_tab_in_location_already_enclosed(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('\u2068utils\tb.py\u2069', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + class PofileFunctionsTestCase(unittest.TestCase): From 0383e690547f4190a068cd232a586a4b4d9de440 Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Fri, 26 Jul 2024 06:31:37 +0200 Subject: [PATCH 2/4] Add handling for reading references with spaces --- babel/messages/pofile.py | 71 ++++++++++++++++++++++++++++++---- tests/messages/test_pofile.py | 72 +++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 7 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 2a9740901..65c8bf85b 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -80,6 +80,47 @@ def denormalize(string: str) -> str: return unescape(string) +def _extract_locations(line: str) -> list[str]: + """Extract locations from reference strings. + + Locations are extracted while properly handling First Strong + Isolate (U+2068) and Pop Directional Isolate (U+2069), used by + gettext to enclose filenames with spaces and tabs in their names. + """ + locations = [] + location = "" + in_filename = False + for c in line: + if c == "\u2068": + if in_filename: + raise ValueError("reference contains more First Strong Isolate characters, " + "than Pop Directional Isolate characters") + in_filename = True + continue + elif c == "\u2069": + if not in_filename: + raise ValueError("reference contains more Pop Directional Isolate characters, " + "than First Strong Isolate characters") + in_filename = False + continue + elif c == " ": + if in_filename: + location += c + elif location: + locations.append(location) + location = "" + else: + location += c + else: + if location: + if in_filename: + raise ValueError("reference contains more First Strong Isolate characters, " + "than Pop Directional Isolate characters") + locations.append(location) + + return locations + + class PoFileError(Exception): """Exception thrown by PoParser when an invalid po file is encountered.""" @@ -269,7 +310,7 @@ def _process_comment(self, line) -> None: self._finish_current_message() if line[1:].startswith(':'): - for location in line[2:].lstrip().split(): + for location in _extract_locations(line[2:]): pos = location.rfind(':') if pos >= 0: try: @@ -307,7 +348,10 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None: if line[1:].startswith('~'): self._process_message_line(lineno, line[2:].lstrip(), obsolete=True) else: - self._process_comment(line) + try: + self._process_comment(line) + except ValueError as exc: + self._invalid_pofile(line, lineno, str(exc)) else: self._process_message_line(lineno, line) @@ -474,6 +518,23 @@ def normalize(string: str, prefix: str = '', width: int = 76) -> str: return '""\n' + '\n'.join([(prefix + escape(line)) for line in lines]) +def _enclose_filename_if_necessary(filename: str) -> str: + """Enclose filenames which include white spaces or tabs. + + Do the same as gettext and enclose filenames which contain white + spaces or tabs with First Strong Isolate (U+2068) and Pop + Directional Isolate (U+2069). + """ + if " " not in filename and "\t" not in filename: + return filename + + if not filename.startswith("\u2068"): + filename = "\u2068" + filename + if not filename.endswith("\u2069"): + filename += "\u2069" + return filename + + def write_po( fileobj: SupportsWrite[bytes], catalog: Catalog, @@ -626,11 +687,7 @@ def _format_message(message, prefix=''): for filename, lineno in locations: location = filename.replace(os.sep, '/') - if " " in location or "\t" in location: - if not location.startswith("\u2068"): - location = "\u2068" + location - if not location.endswith("\u2069"): - location += "\u2069" + location = _enclose_filename_if_necessary(location) if lineno and include_lineno: location = f"{location}:{lineno:d}" if location not in locs: diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index ccfa7ee34..e35a978f3 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -19,6 +19,7 @@ from babel.core import Locale from babel.messages import pofile from babel.messages.catalog import Catalog, Message +from babel.messages.pofile import _enclose_filename_if_necessary, _extract_locations from babel.util import FixedOffsetTimezone @@ -438,6 +439,19 @@ def test_missing_plural_in_the_middle(self): assert message.string[1] == '' assert message.string[2] == 'Vohs [text]' + def test_with_location(self): + buf = StringIO('''\ +#: main.py:1 \u2068filename with whitespace.py\u2069:123 +msgid "foo" +msgstr "bar" +''') + catalog = pofile.read_po(buf, locale='de_DE') + assert len(catalog) == 1 + message = catalog['foo'] + assert message.string == 'bar' + assert message.locations == [("main.py", 1), ("filename with whitespace.py", 123)] + + def test_abort_invalid_po_file(self): invalid_po = ''' msgctxt "" @@ -882,6 +896,19 @@ def test_tab_in_location_already_enclosed(self): msgstr ""''' +class RoundtripPoTestCase(unittest.TestCase): + + def test_enclosed_filenames_in_references(self): + catalog = Catalog() + catalog.add("foo", lineno=2, locations=[("main 1.py", 1)], string="") + catalog.add("bar", lineno=6, locations=[("other.py", 2)], string="") + catalog.add("baz", lineno=10, locations=[("main 1.py", 3), ("other.py", 4)], string="") + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + buf.seek(0) + catalog2 = pofile.read_po(buf) + assert True is catalog.is_identical(catalog2) + class PofileFunctionsTestCase(unittest.TestCase): def test_unescape(self): @@ -904,6 +931,51 @@ def test_denormalize_on_msgstr_without_empty_first_line(self): assert expected_denormalized == pofile.denormalize(f'""\n{msgstr}') +@pytest.mark.parametrize(("line", "locations"), [ + ("\u2068file1.po\u2069", ["file1.po"]), + ("file1.po \u2068file 2.po\u2069 file3.po", ["file1.po", "file 2.po", "file3.po"]), + ("file1.po:1 \u2068file 2.po\u2069:2 file3.po:3", ["file1.po:1", "file 2.po:2", "file3.po:3"]), + ("\u2068file1.po\u2069:1 \u2068file\t2.po\u2069:2 file3.po:3", + ["file1.po:1", "file\t2.po:2", "file3.po:3"]), + ("file1.po file2.po", ["file1.po", "file2.po"]), + ("file1.po \u2068\u2069 file2.po", ["file1.po", "file2.po"]), +]) +def test_extract_locations_valid_reference(line, locations): + assert locations == _extract_locations(line) + + +@pytest.mark.parametrize(("line",), [ + ("\u2068file 1.po",), + ("file 1.po\u2069",), + ("\u2069file 1.po\u2068",), + ("\u2068file 1.po:1 \u2068file 2.po\u2069:2",), + ("\u2068file 1.po\u2069:1 file 2.po\u2069:2",), +]) +def test_extract_locations_invalid_reference(line): + with pytest.raises(ValueError): + _extract_locations(line) + + +@pytest.mark.parametrize(("filename",), [ + ("file.po",), + ("file_a.po",), + ("file-a.po",), + ("file\n.po",), + ("\u2068file.po\u2069",), + ("\u2068file a.po\u2069",), +]) +def test_enclose_filename_if_necessary_no_change(filename): + assert filename == _enclose_filename_if_necessary(filename) + + +@pytest.mark.parametrize(("filename",), [ + ("file a.po",), + ("file\ta.po",), +]) +def test_enclose_filename_if_necessary_enclosed(filename): + assert "\u2068" + filename + "\u2069" == _enclose_filename_if_necessary(filename) + + def test_unknown_language_roundtrip(): buf = StringIO(r''' msgid "" From 8b3a3245eb1df8ba0640beb11a50939bf138aaba Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Sun, 18 Aug 2024 19:24:03 +0200 Subject: [PATCH 3/4] Use location comment instead of reference --- babel/messages/pofile.py | 14 +++++++------- tests/messages/test_pofile.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 65c8bf85b..65a7cd3f0 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -81,7 +81,7 @@ def denormalize(string: str) -> str: def _extract_locations(line: str) -> list[str]: - """Extract locations from reference strings. + """Extract locations from location comments. Locations are extracted while properly handling First Strong Isolate (U+2068) and Pop Directional Isolate (U+2069), used by @@ -93,14 +93,14 @@ def _extract_locations(line: str) -> list[str]: for c in line: if c == "\u2068": if in_filename: - raise ValueError("reference contains more First Strong Isolate characters, " - "than Pop Directional Isolate characters") + raise ValueError("location comment contains more First Strong Isolate " + "characters, than Pop Directional Isolate characters") in_filename = True continue elif c == "\u2069": if not in_filename: - raise ValueError("reference contains more Pop Directional Isolate characters, " - "than First Strong Isolate characters") + raise ValueError("location comment contains more Pop Directional Isolate " + "characters, than First Strong Isolate characters") in_filename = False continue elif c == " ": @@ -114,8 +114,8 @@ def _extract_locations(line: str) -> list[str]: else: if location: if in_filename: - raise ValueError("reference contains more First Strong Isolate characters, " - "than Pop Directional Isolate characters") + raise ValueError("location comment contains more First Strong Isolate " + "characters, than Pop Directional Isolate characters") locations.append(location) return locations diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index e35a978f3..24ea95c8b 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -898,7 +898,7 @@ def test_tab_in_location_already_enclosed(self): class RoundtripPoTestCase(unittest.TestCase): - def test_enclosed_filenames_in_references(self): + def test_enclosed_filenames_in_location_comment(self): catalog = Catalog() catalog.add("foo", lineno=2, locations=[("main 1.py", 1)], string="") catalog.add("bar", lineno=6, locations=[("other.py", 2)], string="") @@ -940,7 +940,7 @@ def test_denormalize_on_msgstr_without_empty_first_line(self): ("file1.po file2.po", ["file1.po", "file2.po"]), ("file1.po \u2068\u2069 file2.po", ["file1.po", "file2.po"]), ]) -def test_extract_locations_valid_reference(line, locations): +def test_extract_locations_valid_location_comment(line, locations): assert locations == _extract_locations(line) @@ -951,7 +951,7 @@ def test_extract_locations_valid_reference(line, locations): ("\u2068file 1.po:1 \u2068file 2.po\u2069:2",), ("\u2068file 1.po\u2069:1 file 2.po\u2069:2",), ]) -def test_extract_locations_invalid_reference(line): +def test_extract_locations_invalid_location_comment(line): with pytest.raises(ValueError): _extract_locations(line) From f9f15fa39d16f3be1a2b9be44666a0202f9abd3f Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Wed, 28 Aug 2024 17:06:37 +0200 Subject: [PATCH 4/4] Add a fast path for locations without FSI and PDI --- babel/messages/pofile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 65a7cd3f0..5cd65d867 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -87,6 +87,9 @@ def _extract_locations(line: str) -> list[str]: Isolate (U+2068) and Pop Directional Isolate (U+2069), used by gettext to enclose filenames with spaces and tabs in their names. """ + if "\u2068" not in line and "\u2069" not in line: + return line.lstrip().split() + locations = [] location = "" in_filename = False