From 3a4466f6249436cbaebb25562ee8c0dbbb5fa725 Mon Sep 17 00:00:00 2001
From: Daniel Roschka <danielroschka@phoenitydawn.de>
Date: Mon, 22 Jul 2024 06:46:48 +0200
Subject: [PATCH 1/4] Enclose file names with white space in references

Since version 0.22, gettext encloses file names that contain spaces or
tabs in First Strong Isolate (U+2068) and Pop Directional Isolate
(U+2069) characters when it writes references. This commit adds the
same behavior to Babel.
---
 babel/messages/pofile.py      |  5 +++++
 tests/messages/test_pofile.py | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 89a924255..2a9740901 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -626,6 +626,11 @@ def _format_message(message, prefix=''):
 
             for filename, lineno in locations:
                 location = filename.replace(os.sep, '/')
+                if " " in location or "\t" in location:
+                    if not location.startswith("\u2068"):
+                        location = "\u2068" + location
+                    if not location.endswith("\u2069"):
+                        location += "\u2069"
                 if lineno and include_lineno:
                     location = f"{location}:{lineno:d}"
                 if location not in locs:
diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index 99958b7b7..ccfa7ee34 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -841,6 +841,46 @@ def test_no_include_lineno(self):
 msgid "foo"
 msgstr ""'''
 
+    def test_white_space_in_location(self):
+        catalog = Catalog()
+        catalog.add('foo', locations=[('main.py', 1)])
+        catalog.add('foo', locations=[('utils b.py', 3)])
+        buf = BytesIO()
+        pofile.write_po(buf, catalog, omit_header=True, include_lineno=True)
+        assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3
+msgid "foo"
+msgstr ""'''
+
+    def test_white_space_in_location_already_enclosed(self):
+        catalog = Catalog()
+        catalog.add('foo', locations=[('main.py', 1)])
+        catalog.add('foo', locations=[('\u2068utils b.py\u2069', 3)])
+        buf = BytesIO()
+        pofile.write_po(buf, catalog, omit_header=True, include_lineno=True)
+        assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3
+msgid "foo"
+msgstr ""'''
+
+    def test_tab_in_location(self):
+        catalog = Catalog()
+        catalog.add('foo', locations=[('main.py', 1)])
+        catalog.add('foo', locations=[('utils\tb.py', 3)])
+        buf = BytesIO()
+        pofile.write_po(buf, catalog, omit_header=True, include_lineno=True)
+        assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils        b.py\xe2\x81\xa9:3
+msgid "foo"
+msgstr ""'''
+
+    def test_tab_in_location_already_enclosed(self):
+        catalog = Catalog()
+        catalog.add('foo', locations=[('main.py', 1)])
+        catalog.add('foo', locations=[('\u2068utils\tb.py\u2069', 3)])
+        buf = BytesIO()
+        pofile.write_po(buf, catalog, omit_header=True, include_lineno=True)
+        assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils        b.py\xe2\x81\xa9:3
+msgid "foo"
+msgstr ""'''
+
 
 class PofileFunctionsTestCase(unittest.TestCase):
 

From 0383e690547f4190a068cd232a586a4b4d9de440 Mon Sep 17 00:00:00 2001
From: Daniel Roschka <danielroschka@phoenitydawn.de>
Date: Fri, 26 Jul 2024 06:31:37 +0200
Subject: [PATCH 2/4] Add handling for reading references with spaces

---
 babel/messages/pofile.py      | 71 ++++++++++++++++++++++++++++++----
 tests/messages/test_pofile.py | 72 +++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 2a9740901..65c8bf85b 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -80,6 +80,47 @@ def denormalize(string: str) -> str:
         return unescape(string)
 
 
+def _extract_locations(line: str) -> list[str]:
+    """Extract locations from reference strings.
+
+    Locations are extracted while properly handling First Strong
+    Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
+    gettext to enclose filenames with spaces and tabs in their names.
+    """
+    locations = []
+    location = ""
+    in_filename = False
+    for c in line:
+        if c == "\u2068":
+            if in_filename:
+                raise ValueError("reference contains more First Strong Isolate characters, "
+                                 "than Pop Directional Isolate characters")
+            in_filename = True
+            continue
+        elif c == "\u2069":
+            if not in_filename:
+                raise ValueError("reference contains more Pop Directional Isolate characters, "
+                                 "than First Strong Isolate characters")
+            in_filename = False
+            continue
+        elif c == " ":
+            if in_filename:
+                location += c
+            elif location:
+                locations.append(location)
+                location = ""
+        else:
+            location += c
+    else:
+        if location:
+            if in_filename:
+                raise ValueError("reference contains more First Strong Isolate characters, "
+                                 "than Pop Directional Isolate characters")
+            locations.append(location)
+
+    return locations
+
+
 class PoFileError(Exception):
     """Exception thrown by PoParser when an invalid po file is encountered."""
 
@@ -269,7 +310,7 @@ def _process_comment(self, line) -> None:
         self._finish_current_message()
 
         if line[1:].startswith(':'):
-            for location in line[2:].lstrip().split():
+            for location in _extract_locations(line[2:]):
                 pos = location.rfind(':')
                 if pos >= 0:
                     try:
@@ -307,7 +348,10 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
                 if line[1:].startswith('~'):
                     self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                 else:
-                    self._process_comment(line)
+                    try:
+                        self._process_comment(line)
+                    except ValueError as exc:
+                        self._invalid_pofile(line, lineno, str(exc))
             else:
                 self._process_message_line(lineno, line)
 
@@ -474,6 +518,23 @@ def normalize(string: str, prefix: str = '', width: int = 76) -> str:
     return '""\n' + '\n'.join([(prefix + escape(line)) for line in lines])
 
 
+def _enclose_filename_if_necessary(filename: str) -> str:
+    """Enclose filenames which include white spaces or tabs.
+
+    Do the same as gettext and enclose filenames which contain white
+    spaces or tabs with First Strong Isolate (U+2068) and Pop
+    Directional Isolate (U+2069).
+    """
+    if " " not in filename and "\t" not in filename:
+        return filename
+
+    if not filename.startswith("\u2068"):
+        filename = "\u2068" + filename
+    if not filename.endswith("\u2069"):
+        filename += "\u2069"
+    return filename
+
+
 def write_po(
     fileobj: SupportsWrite[bytes],
     catalog: Catalog,
@@ -626,11 +687,7 @@ def _format_message(message, prefix=''):
 
             for filename, lineno in locations:
                 location = filename.replace(os.sep, '/')
-                if " " in location or "\t" in location:
-                    if not location.startswith("\u2068"):
-                        location = "\u2068" + location
-                    if not location.endswith("\u2069"):
-                        location += "\u2069"
+                location = _enclose_filename_if_necessary(location)
                 if lineno and include_lineno:
                     location = f"{location}:{lineno:d}"
                 if location not in locs:
diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index ccfa7ee34..e35a978f3 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -19,6 +19,7 @@
 from babel.core import Locale
 from babel.messages import pofile
 from babel.messages.catalog import Catalog, Message
+from babel.messages.pofile import _enclose_filename_if_necessary, _extract_locations
 from babel.util import FixedOffsetTimezone
 
 
@@ -438,6 +439,19 @@ def test_missing_plural_in_the_middle(self):
         assert message.string[1] == ''
         assert message.string[2] == 'Vohs [text]'
 
+    def test_with_location(self):
+        buf = StringIO('''\
+#: main.py:1 \u2068filename with whitespace.py\u2069:123
+msgid "foo"
+msgstr "bar"
+''')
+        catalog = pofile.read_po(buf, locale='de_DE')
+        assert len(catalog) == 1
+        message = catalog['foo']
+        assert message.string == 'bar'
+        assert message.locations == [("main.py", 1), ("filename with whitespace.py", 123)]
+
+
     def test_abort_invalid_po_file(self):
         invalid_po = '''
             msgctxt ""
@@ -882,6 +896,19 @@ def test_tab_in_location_already_enclosed(self):
 msgstr ""'''
 
 
+class RoundtripPoTestCase(unittest.TestCase):
+
+    def test_enclosed_filenames_in_references(self):
+        catalog = Catalog()
+        catalog.add("foo", lineno=2, locations=[("main 1.py", 1)], string="")
+        catalog.add("bar", lineno=6, locations=[("other.py", 2)], string="")
+        catalog.add("baz", lineno=10, locations=[("main 1.py", 3), ("other.py", 4)], string="")
+        buf = BytesIO()
+        pofile.write_po(buf, catalog, omit_header=True, include_lineno=True)
+        buf.seek(0)
+        catalog2 = pofile.read_po(buf)
+        assert True is catalog.is_identical(catalog2)
+
 class PofileFunctionsTestCase(unittest.TestCase):
 
     def test_unescape(self):
@@ -904,6 +931,51 @@ def test_denormalize_on_msgstr_without_empty_first_line(self):
         assert expected_denormalized == pofile.denormalize(f'""\n{msgstr}')
 
 
+@pytest.mark.parametrize(("line", "locations"), [
+    ("\u2068file1.po\u2069", ["file1.po"]),
+    ("file1.po \u2068file 2.po\u2069 file3.po", ["file1.po", "file 2.po", "file3.po"]),
+    ("file1.po:1 \u2068file 2.po\u2069:2 file3.po:3", ["file1.po:1", "file 2.po:2", "file3.po:3"]),
+    ("\u2068file1.po\u2069:1 \u2068file\t2.po\u2069:2 file3.po:3",
+     ["file1.po:1", "file\t2.po:2", "file3.po:3"]),
+    ("file1.po  file2.po", ["file1.po", "file2.po"]),
+    ("file1.po \u2068\u2069 file2.po", ["file1.po", "file2.po"]),
+])
+def test_extract_locations_valid_reference(line, locations):
+    assert locations == _extract_locations(line)
+
+
+@pytest.mark.parametrize(("line",), [
+    ("\u2068file 1.po",),
+    ("file 1.po\u2069",),
+    ("\u2069file 1.po\u2068",),
+    ("\u2068file 1.po:1 \u2068file 2.po\u2069:2",),
+    ("\u2068file 1.po\u2069:1 file 2.po\u2069:2",),
+])
+def test_extract_locations_invalid_reference(line):
+    with pytest.raises(ValueError):
+        _extract_locations(line)
+
+
+@pytest.mark.parametrize(("filename",), [
+    ("file.po",),
+    ("file_a.po",),
+    ("file-a.po",),
+    ("file\n.po",),
+    ("\u2068file.po\u2069",),
+    ("\u2068file a.po\u2069",),
+])
+def test_enclose_filename_if_necessary_no_change(filename):
+    assert filename == _enclose_filename_if_necessary(filename)
+
+
+@pytest.mark.parametrize(("filename",), [
+    ("file a.po",),
+    ("file\ta.po",),
+])
+def test_enclose_filename_if_necessary_enclosed(filename):
+    assert "\u2068" + filename + "\u2069" == _enclose_filename_if_necessary(filename)
+
+
 def test_unknown_language_roundtrip():
     buf = StringIO(r'''
 msgid ""

From 8b3a3245eb1df8ba0640beb11a50939bf138aaba Mon Sep 17 00:00:00 2001
From: Daniel Roschka <danielroschka@phoenitydawn.de>
Date: Sun, 18 Aug 2024 19:24:03 +0200
Subject: [PATCH 3/4] Use location comment instead of reference

---
 babel/messages/pofile.py      | 14 +++++++-------
 tests/messages/test_pofile.py |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 65c8bf85b..65a7cd3f0 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -81,7 +81,7 @@ def denormalize(string: str) -> str:
 
 
 def _extract_locations(line: str) -> list[str]:
-    """Extract locations from reference strings.
+    """Extract locations from location comments.
 
     Locations are extracted while properly handling First Strong
     Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
@@ -93,14 +93,14 @@ def _extract_locations(line: str) -> list[str]:
     for c in line:
         if c == "\u2068":
             if in_filename:
-                raise ValueError("reference contains more First Strong Isolate characters, "
-                                 "than Pop Directional Isolate characters")
+                raise ValueError("location comment contains more First Strong Isolate "
+                                 "characters, than Pop Directional Isolate characters")
             in_filename = True
             continue
         elif c == "\u2069":
             if not in_filename:
-                raise ValueError("reference contains more Pop Directional Isolate characters, "
-                                 "than First Strong Isolate characters")
+                raise ValueError("location comment contains more Pop Directional Isolate "
+                                 "characters, than First Strong Isolate characters")
             in_filename = False
             continue
         elif c == " ":
@@ -114,8 +114,8 @@ def _extract_locations(line: str) -> list[str]:
     else:
         if location:
             if in_filename:
-                raise ValueError("reference contains more First Strong Isolate characters, "
-                                 "than Pop Directional Isolate characters")
+                raise ValueError("location comment contains more First Strong Isolate "
+                                 "characters, than Pop Directional Isolate characters")
             locations.append(location)
 
     return locations
diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index e35a978f3..24ea95c8b 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -898,7 +898,7 @@ def test_tab_in_location_already_enclosed(self):
 
 class RoundtripPoTestCase(unittest.TestCase):
 
-    def test_enclosed_filenames_in_references(self):
+    def test_enclosed_filenames_in_location_comment(self):
         catalog = Catalog()
         catalog.add("foo", lineno=2, locations=[("main 1.py", 1)], string="")
         catalog.add("bar", lineno=6, locations=[("other.py", 2)], string="")
@@ -940,7 +940,7 @@ def test_denormalize_on_msgstr_without_empty_first_line(self):
     ("file1.po  file2.po", ["file1.po", "file2.po"]),
     ("file1.po \u2068\u2069 file2.po", ["file1.po", "file2.po"]),
 ])
-def test_extract_locations_valid_reference(line, locations):
+def test_extract_locations_valid_location_comment(line, locations):
     assert locations == _extract_locations(line)
 
 
@@ -951,7 +951,7 @@ def test_extract_locations_valid_reference(line, locations):
     ("\u2068file 1.po:1 \u2068file 2.po\u2069:2",),
     ("\u2068file 1.po\u2069:1 file 2.po\u2069:2",),
 ])
-def test_extract_locations_invalid_reference(line):
+def test_extract_locations_invalid_location_comment(line):
     with pytest.raises(ValueError):
         _extract_locations(line)
 

From f9f15fa39d16f3be1a2b9be44666a0202f9abd3f Mon Sep 17 00:00:00 2001
From: Daniel Roschka <danielroschka@phoenitydawn.de>
Date: Wed, 28 Aug 2024 17:06:37 +0200
Subject: [PATCH 4/4] Add a fast path for locations without FSI and PDI

---
 babel/messages/pofile.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 65a7cd3f0..5cd65d867 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -87,6 +87,9 @@ def _extract_locations(line: str) -> list[str]:
     Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
     gettext to enclose filenames with spaces and tabs in their names.
     """
+    if "\u2068" not in line and "\u2069" not in line:
+        return line.lstrip().split()
+
     locations = []
     location = ""
     in_filename = False