Skip to content

Commit

Permalink
Add figure id in HTML output
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored Aug 16, 2024
2 parents 82ceab8 + e74a0cb commit 3fbe596
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 31 deletions.
6 changes: 5 additions & 1 deletion textractor/data/html_linearization_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,8 @@ class HTMLLinearizationConfig(TextLinearizationConfig):

page_num_prefix: str = "<div>"

page_num_suffix: str = "</div>"
page_num_suffix: str = "</div>"

add_ids_to_html_tags: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML.

add_short_ids_to_html_tags: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML
5 changes: 3 additions & 2 deletions textractor/entities/key_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from textractor.data.constants import TextTypes
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.visualizers.entitylist import EntityList
from textractor.utils.html_utils import add_id_to_html_tag


class KeyValue(DocumentEntity):
Expand Down Expand Up @@ -235,12 +236,12 @@ def get_text_and_words(
else " "
)
if config.add_prefixes_and_suffixes_in_text:
text = f"{config.key_value_prefix}{config.key_prefix}{key_text}{key_suffix}{value_text}{config.key_value_suffix}"
text = f"{add_id_to_html_tag(config.key_value_prefix, self.id, config)}{config.key_prefix}{key_text}{key_suffix}{value_text}{config.key_value_suffix}"
else:
text = f"{key_text}{config.same_paragraph_separator}{value_text}"

if config.add_prefixes_and_suffixes_as_words:
words += [Word(str(uuid.uuid4()), self.bbox, config.key_value_prefix, is_structure=True)] if config.key_value_prefix else []
words += [Word(str(uuid.uuid4()), self.bbox, add_id_to_html_tag(config.key_value_prefix, self.id, config), is_structure=True)] if config.key_value_prefix else []
if key_words:
words += (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(self.key), config.key_prefix, is_structure=True)] if config.key_prefix else []) +
Expand Down
48 changes: 28 additions & 20 deletions textractor/entities/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
)
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.utils.text_utils import group_elements_horizontally, linearize_children
from textractor.utils.html_utils import add_id_to_html_tag


class Layout(DocumentEntity):
Expand Down Expand Up @@ -122,7 +123,7 @@ def get_text_and_words(
)
if config.add_prefixes_and_suffixes_as_words:
return (
config.page_num_prefix + final_text + config.page_num_suffix,
add_id_to_html_tag(config.page_num_prefix, self.id, config) + final_text + config.page_num_suffix,
(
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), is_structure=True), config.page_num_prefix] if config.page_num_prefix else []) +
final_words +
Expand All @@ -135,17 +136,18 @@ def get_text_and_words(
final_words,
)
elif self.layout_type == LAYOUT_LIST:
final_text = config.list_layout_prefix
final_text = add_id_to_html_tag(config.list_layout_prefix, self.id, config)
final_words = []
for i, child in enumerate(
sorted(self.children, key=lambda x: x.reading_order)
):
child_text, child_words = child.get_text_and_words(config)
child_prefix = add_id_to_html_tag(config.list_element_prefix, child.id, config)
final_text += (
(
config.list_element_prefix
child_prefix
if (
child_text[: len(config.list_element_prefix)] != config.list_element_prefix and
child_text[:len(child_prefix)] != child_prefix and
config.add_prefixes_and_suffixes_in_text
) else ""
)
Expand All @@ -162,7 +164,7 @@ def get_text_and_words(
)
if config.add_prefixes_and_suffixes_as_words:
final_words += (
([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), config.list_element_prefix, is_structure=True)] if config.list_element_prefix else []) +
([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), add_id_to_html_tag(config.list_element_prefix, child.id, config), is_structure=True)] if config.list_element_prefix else []) +
child_words +
([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), config.list_element_suffix, is_structure=True)] if config.list_element_suffix else [])
)
Expand All @@ -174,10 +176,10 @@ def get_text_and_words(
self.children, config, no_new_lines=True
)
if config.add_prefixes_and_suffixes_in_text:
final_text = config.title_prefix + final_text + config.title_suffix
final_text = add_id_to_html_tag(config.title_prefix, self.id, config) + final_text + config.title_suffix
if config.add_prefixes_and_suffixes_as_words:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.title_prefix, is_structure=True)] if config.title_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.title_prefix, self.id, config), is_structure=True)] if config.title_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.title_suffix, is_structure=True)] if config.title_suffix else [])
)
Expand All @@ -187,11 +189,11 @@ def get_text_and_words(
)
if config.add_prefixes_and_suffixes_in_text:
final_text = (
config.header_prefix + final_text + config.header_suffix
add_id_to_html_tag(config.header_prefix, self.id, config) + final_text + config.header_suffix
)
if config.add_prefixes_and_suffixes_as_words:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.header_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.header_prefix, self.id, config), is_structure=True)] if config.header_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.header_suffix else [])
)
Expand All @@ -201,11 +203,11 @@ def get_text_and_words(
)
if config.add_prefixes_and_suffixes_in_text:
final_text = (
config.section_header_prefix + final_text + config.section_header_suffix
add_id_to_html_tag(config.section_header_prefix, self.id, config) + final_text + config.section_header_suffix
)
if config.add_prefixes_and_suffixes_as_words:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.section_header_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.section_header_prefix, self.id, config), is_structure=True)] if config.section_header_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.section_header_suffix else [])
)
Expand All @@ -216,8 +218,14 @@ def get_text_and_words(
no_new_lines=True,
)
final_text = (
config.text_prefix + final_text + config.text_suffix
add_id_to_html_tag(config.text_prefix, self.id, config) + final_text + config.text_suffix
)
if config.add_prefixes_and_suffixes_as_words:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.text_prefix, self.id, config), is_structure=True)] if config.text_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.text_suffix, is_structure=True)] if config.text_suffix else [])
)
else:
final_text, final_words = linearize_children(
self.children,
Expand All @@ -229,40 +237,40 @@ def get_text_and_words(
if config.add_prefixes_and_suffixes_in_text:
if self.layout_type == LAYOUT_TABLE:
final_text = (
config.table_layout_prefix + final_text + config.table_layout_suffix
add_id_to_html_tag(config.table_layout_prefix, self.id, config) + final_text + config.table_layout_suffix
)
elif self.layout_type == LAYOUT_KEY_VALUE:
final_text = (
config.key_value_layout_prefix + final_text + config.key_value_layout_suffix
add_id_to_html_tag(config.key_value_layout_prefix, self.id, config) + final_text + config.key_value_layout_suffix
)
elif self.layout_type == LAYOUT_FIGURE:
final_text = (
config.figure_layout_prefix + final_text + config.figure_layout_suffix
add_id_to_html_tag(config.figure_layout_prefix, self.id, config) + final_text + config.figure_layout_suffix
)
elif self.layout_type == LAYOUT_ENTITY:
final_text = (
config.entity_layout_prefix + final_text + config.entity_layout_suffix
add_id_to_html_tag(config.entity_layout_prefix, self.id, config) + final_text + config.entity_layout_suffix
)
elif self.layout_type == LAYOUT_FOOTER:
final_text = (
config.footer_layout_prefix + final_text + config.footer_layout_suffix
add_id_to_html_tag(config.footer_layout_prefix, self.id, config) + final_text + config.footer_layout_suffix
)
if config.add_prefixes_and_suffixes_as_words:
if self.layout_type == LAYOUT_TABLE:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_prefix, is_structure=True)] if config.table_layout_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.table_layout_prefix, self.id, config), is_structure=True)] if config.table_layout_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_suffix, is_structure=True)] if config.table_layout_suffix else [])
)
elif self.layout_type == LAYOUT_KEY_VALUE:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_prefix, is_structure=True)] if config.key_value_layout_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.key_value_layout_prefix, self.id, config), is_structure=True)] if config.key_value_layout_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else [])
)
elif self.layout_type == LAYOUT_FIGURE:
final_words = (
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.figure_layout_prefix, is_structure=True)] if config.figure_layout_prefix else []) +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.figure_layout_prefix, self.id, config), is_structure=True)] if config.figure_layout_prefix else []) +
final_words +
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.figure_layout_suffix, is_structure=True)] if config.figure_layout_suffix else [])
)
Expand Down
6 changes: 3 additions & 3 deletions textractor/entities/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from textractor.utils.text_utils import group_elements_horizontally, linearize_children
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.data.html_linearization_config import HTMLLinearizationConfig

from textractor.utils.html_utils import add_id_to_html_tag

class Table(DocumentEntity):
"""
Expand Down Expand Up @@ -684,7 +684,7 @@ def get_text_and_words(
if len(words_) < local_config.table_min_table_words:
return linearize_children(words_, config=config)

words = [Word(str(uuid.uuid4()), self.bbox, local_config.table_prefix)] if local_config.table_prefix else []
words = [Word(str(uuid.uuid4()), self.bbox, add_id_to_html_tag(local_config.table_prefix, self.id, local_config))] if local_config.table_prefix else []
rows = sorted([(key, list(group)) for key, group in itertools.groupby(
self.table_cells, key=lambda cell: cell.row_index
)], key=lambda r: r[0])
Expand Down Expand Up @@ -858,7 +858,7 @@ def get_text_and_words(
w.table_id = str(self.id)
w.table_bbox = self.bbox

text = (local_config.table_prefix if local_config.add_prefixes_and_suffixes_in_text else "")
text = (add_id_to_html_tag(local_config.table_prefix, self.id, local_config) if local_config.add_prefixes_and_suffixes_in_text else "")
# Markdown
if local_config.table_linearization_format == "markdown":
df = self.to_pandas(
Expand Down
5 changes: 3 additions & 2 deletions textractor/entities/value.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from textractor.visualizers.entitylist import EntityList
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.utils.text_utils import linearize_children
from textractor.utils.html_utils import add_id_to_html_tag


class Value(DocumentEntity):
Expand Down Expand Up @@ -171,15 +172,15 @@ def get_text_and_words(
no_new_lines=config.remove_new_lines_in_leaf_elements,
)
if config.add_prefixes_and_suffixes_in_text:
text = config.value_prefix + text + config.value_suffix
text = add_id_to_html_tag(config.value_prefix, self.id, config) + text + config.value_suffix
if config.add_prefixes_and_suffixes_as_words:
words = (
(
[
Word(
str(uuid.uuid4()),
self.bbox,
config.value_prefix,
add_id_to_html_tag(config.value_prefix, self.id, config),
is_structure=True,
is_clickable=(
bool(words) and words[0] in [config.selection_element_selected, config.selection_element_not_selected]
Expand Down
6 changes: 3 additions & 3 deletions textractor/parsers/response_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ def _create_signature_objects(
if signature not in signatures_added:
signatures_added.add(signature)
layout = Layout(
entity_id=signature.id,
entity_id=str(uuid.uuid4()),
bbox=signature.bbox,
label=LAYOUT_ENTITY,
reading_order=-1,
Expand Down Expand Up @@ -1103,7 +1103,7 @@ def _create_table_objects(
if table not in table_added:
table_added.add(table)
layout = Layout(
entity_id=table.id,
entity_id=str(uuid.uuid4()),
bbox=table.bbox,
label=LAYOUT_TABLE,
reading_order=-1,
Expand Down Expand Up @@ -1292,7 +1292,7 @@ def parse_document_api_response(response: dict) -> Document:
if kv.id not in kv_added:
kv_added.add(kv.id)
layout = Layout(
entity_id=kv.id,
entity_id=str(uuid.uuid4()),
bbox=kv.bbox,
label=LAYOUT_KEY_VALUE,
reading_order=-1,
Expand Down
11 changes: 11 additions & 0 deletions textractor/utils/html_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from textractor.data.html_linearization_config import HTMLLinearizationConfig

def add_id_to_html_tag(prefix, id, config):
    """Return *prefix* with an ``id="..."`` attribute inserted into the tag.

    The attribute is spliced in just before the last character of *prefix*,
    so ``"<div>"`` becomes ``'<div id="...">'``.

    :param prefix: Opening HTML markup to decorate. Returned unchanged when
        empty/None.
    :param id: Block identifier to embed. Converted with ``str()`` before use.
    :param config: Linearization config. Ids are only added when it is an
        ``HTMLLinearizationConfig``; any other config leaves *prefix* untouched.
    :return: *prefix*, with the id attribute added when one of the id flags
        is enabled on *config*.
    """
    # Nothing to decorate, or not an HTML linearization: leave as-is.
    if not prefix or not isinstance(config, HTMLLinearizationConfig):
        return prefix

    if config.add_ids_to_html_tags:
        # Full identifier (takes precedence when both flags are enabled).
        tag_id = str(id)
    elif config.add_short_ids_to_html_tags:
        # Truncated identifier: first 8 characters only.
        tag_id = str(id)[:8]
    else:
        return prefix

    # NOTE(review): assumes the final character of prefix is the closing ">"
    # of the opening tag — confirm for custom prefixes with trailing text.
    return f'{prefix[:-1]} id="{tag_id}"{prefix[-1]}'

0 comments on commit 3fbe596

Please sign in to comment.