Skip to content

Commit

Permalink
Add figure layout prefix and suffix
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored May 10, 2024
2 parents 44a002d + e4421e0 commit 03024da
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 0 deletions.

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions tests/test_get_text_and_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,39 @@ def test_table_prefixes_and_suffixes_in_words(self):
"</value>",
]:
self.assertTrue(token in words, f"{token} is not in text")

def test_figure_layout_prefixes_and_suffixes_in_text_words(self):
    """Check that figure layout prefix/suffix tokens show up in text and words.

    When the ``CALL_TEXTRACT`` environment variable is set, the
    ``fixtures/matrix.png`` image is analyzed with the LAYOUT feature and
    the raw response is written to the fixture path; otherwise the
    document is loaded from the previously saved fixture.
    """
    # NOTE(review): get_fixture_path() is deliberately called straight from
    # the test body. Presumably it derives the path from the calling test --
    # confirm before moving these calls into a helper or comprehension.
    if os.environ.get("CALL_TEXTRACT"):
        document = self.extractor.analyze_document(
            os.path.join(self.current_directory, "fixtures/matrix.png"),
            features=[TextractFeatures.LAYOUT],
        )
        with open(get_fixture_path(), "w") as f:
            json.dump(document.response, f)
    else:
        document = Document.open(get_fixture_path())

    # Single source of truth for the tokens configured and asserted below.
    figure_prefix = "<figure>"
    figure_suffix = "</figure>"
    figure_tokens = (figure_prefix, figure_suffix)

    config = TextLinearizationConfig(
        figure_layout_prefix=figure_prefix,
        figure_layout_suffix=figure_suffix,
        add_prefixes_and_suffixes_in_text=True,
        add_prefixes_and_suffixes_as_words=True,
    )

    text, words = document.get_text_and_words(config)
    # Keep the Word objects and their plain strings under separate names.
    word_texts = [w.text for w in words]

    # assertIn reports the searched container on failure, unlike
    # assertTrue(token in ...), which only reports "False is not true".
    for token in figure_tokens:
        self.assertIn(token, text, f"{token} is not in text")

    for token in figure_tokens:
        self.assertIn(token, word_texts, f"{token} is not in words")
4 changes: 4 additions & 0 deletions textractor/data/text_linearization_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ class TextLinearizationConfig:

value_suffix: str = "" #: Suffix for value elements

figure_layout_prefix: str = "" #: Prefix for figure layout elements

figure_layout_suffix: str = "" #: Suffix for figure layout elements

selection_element_selected: str = (
"[X]" #: Representation for selection element when selected
)
Expand Down
10 changes: 10 additions & 0 deletions textractor/entities/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ def get_text_and_words(
final_text = (
config.key_value_layout_prefix + final_text + config.key_value_layout_suffix
)
elif self.layout_type == LAYOUT_FIGURE:
final_text = (
config.figure_layout_prefix + final_text + config.figure_layout_suffix
)
if config.add_prefixes_and_suffixes_as_words:
if self.layout_type == LAYOUT_TABLE:
final_words = (
Expand All @@ -245,6 +249,12 @@ def get_text_and_words(
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else [])
)
elif self.layout_type == LAYOUT_FIGURE:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.figure_layout_prefix, is_structure=True)] if config.figure_layout_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.figure_layout_suffix, is_structure=True)] if config.figure_layout_suffix else [])
)

while (
config.layout_element_separator * (config.max_number_of_consecutive_new_lines + 1) in final_text
Expand Down

0 comments on commit 03024da

Please sign in to comment.