From 8da9476adee722cf128457171215e0f599157794 Mon Sep 17 00:00:00 2001 From: Edouard Belval Date: Tue, 13 Aug 2024 17:20:31 +0000 Subject: [PATCH 1/2] Add id= in HTML tags --- textractor/data/html_linearization_config.py | 6 ++- textractor/entities/key_value.py | 5 +- textractor/entities/layout.py | 48 ++++++++++++-------- textractor/entities/table.py | 6 +-- textractor/entities/value.py | 5 +- textractor/utils/html_utils.py | 9 ++++ 6 files changed, 51 insertions(+), 28 deletions(-) create mode 100644 textractor/utils/html_utils.py diff --git a/textractor/data/html_linearization_config.py b/textractor/data/html_linearization_config.py index 3ecffaab..d34e77ab 100644 --- a/textractor/data/html_linearization_config.py +++ b/textractor/data/html_linearization_config.py @@ -78,4 +78,8 @@ class HTMLLinearizationConfig(TextLinearizationConfig): page_num_prefix: str = "
" - page_num_suffix: str = "
" \ No newline at end of file + page_num_suffix: str = "" + + add_ids_to_components: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML. + + add_short_ids_to_components: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML \ No newline at end of file diff --git a/textractor/entities/key_value.py b/textractor/entities/key_value.py index c2507d42..e1e14ecd 100644 --- a/textractor/entities/key_value.py +++ b/textractor/entities/key_value.py @@ -18,6 +18,7 @@ from textractor.data.constants import TextTypes from textractor.data.text_linearization_config import TextLinearizationConfig from textractor.visualizers.entitylist import EntityList +from textractor.utils.html_utils import add_id_to_html_tag class KeyValue(DocumentEntity): @@ -235,12 +236,12 @@ def get_text_and_words( else " " ) if config.add_prefixes_and_suffixes_in_text: - text = f"{config.key_value_prefix}{config.key_prefix}{key_text}{key_suffix}{value_text}{config.key_value_suffix}" + text = f"{add_id_to_html_tag(config.key_value_prefix, self.id, config)}{config.key_prefix}{key_text}{key_suffix}{value_text}{config.key_value_suffix}" else: text = f"{key_text}{config.same_paragraph_separator}{value_text}" if config.add_prefixes_and_suffixes_as_words: - words += [Word(str(uuid.uuid4()), self.bbox, config.key_value_prefix, is_structure=True)] if config.key_value_prefix else [] + words += [Word(str(uuid.uuid4()), self.bbox, add_id_to_html_tag(config.key_value_prefix, self.id, config), is_structure=True)] if config.key_value_prefix else [] if key_words: words += ( ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(self.key), config.key_prefix, is_structure=True)] if config.key_prefix else []) + diff --git a/textractor/entities/layout.py b/textractor/entities/layout.py index 21b1daef..363cebb6 100644 --- a/textractor/entities/layout.py +++ b/textractor/entities/layout.py @@ -25,6 +25,7 @@ ) from 
textractor.data.text_linearization_config import TextLinearizationConfig from textractor.utils.text_utils import group_elements_horizontally, linearize_children +from textractor.utils.html_utils import add_id_to_html_tag class Layout(DocumentEntity): @@ -122,7 +123,7 @@ def get_text_and_words( ) if config.add_prefixes_and_suffixes_as_words: return ( - config.page_num_prefix + final_text + config.page_num_suffix, + add_id_to_html_tag(config.page_num_prefix, self.id, config) + final_text + config.page_num_suffix, ( ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), is_structure=True), config.page_num_prefix] if config.page_num_prefix else []) + final_words + @@ -135,17 +136,18 @@ def get_text_and_words( final_words, ) elif self.layout_type == LAYOUT_LIST: - final_text = config.list_layout_prefix + final_text = add_id_to_html_tag(config.list_layout_prefix, self.id, config) final_words = [] for i, child in enumerate( sorted(self.children, key=lambda x: x.reading_order) ): child_text, child_words = child.get_text_and_words(config) + child_prefix = add_id_to_html_tag(config.list_element_prefix, child.id, config) final_text += ( ( - config.list_element_prefix + child_prefix if ( - child_text[: len(config.list_element_prefix)] != config.list_element_prefix and + child_text[:len(child_prefix)] != child_prefix and config.add_prefixes_and_suffixes_in_text ) else "" ) @@ -162,7 +164,7 @@ def get_text_and_words( ) if config.add_prefixes_and_suffixes_as_words: final_words += ( - ([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), config.list_element_prefix, is_structure=True)] if config.list_element_prefix else []) + + ([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), add_id_to_html_tag(config.list_element_prefix, child.id, config), is_structure=True)] if config.list_element_prefix else []) + child_words + ([Word(str(uuid.uuid4(), BoundingBox.enclosing_bbox(child_words)), config.list_element_suffix, is_structure=True)] if 
config.list_element_suffix else []) ) @@ -174,10 +176,10 @@ def get_text_and_words( self.children, config, no_new_lines=True ) if config.add_prefixes_and_suffixes_in_text: - final_text = config.title_prefix + final_text + config.title_suffix + final_text = add_id_to_html_tag(config.title_prefix, self.id, config) + final_text + config.title_suffix if config.add_prefixes_and_suffixes_as_words: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.title_prefix, is_structure=True)] if config.title_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.title_prefix, self.id, config), is_structure=True)] if config.title_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.title_suffix, is_structure=True)] if config.title_suffix else []) ) @@ -187,11 +189,11 @@ def get_text_and_words( ) if config.add_prefixes_and_suffixes_in_text: final_text = ( - config.header_prefix + final_text + config.header_suffix + add_id_to_html_tag(config.header_prefix, self.id, config) + final_text + config.header_suffix ) if config.add_prefixes_and_suffixes_as_words: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.header_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.header_prefix, self.id, config), is_structure=True)] if config.header_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.header_suffix else []) ) @@ -201,11 +203,11 @@ def get_text_and_words( ) if config.add_prefixes_and_suffixes_in_text: final_text = ( - config.section_header_prefix + final_text + config.section_header_suffix + add_id_to_html_tag(config.section_header_prefix, self.id, config) + final_text + 
config.section_header_suffix ) if config.add_prefixes_and_suffixes_as_words: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.section_header_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.section_header_prefix, self.id, config), is_structure=True)] if config.section_header_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.section_header_suffix else []) ) @@ -216,8 +218,14 @@ def get_text_and_words( no_new_lines=True, ) final_text = ( - config.text_prefix + final_text + config.text_suffix + add_id_to_html_tag(config.text_prefix, self.id, config) + final_text + config.text_suffix ) + if config.add_prefixes_and_suffixes_as_words: + final_words = ( + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.text_prefix, self.id, config), is_structure=True)] if config.text_prefix else []) + + final_words + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.text_suffix, is_structure=True)] if config.text_suffix else []) + ) else: final_text, final_words = linearize_children( self.children, @@ -229,40 +237,40 @@ def get_text_and_words( if config.add_prefixes_and_suffixes_in_text: if self.layout_type == LAYOUT_TABLE: final_text = ( - config.table_layout_prefix + final_text + config.table_layout_suffix + add_id_to_html_tag(config.table_layout_prefix, self.id, config) + final_text + config.table_layout_suffix ) elif self.layout_type == LAYOUT_KEY_VALUE: final_text = ( - config.key_value_layout_prefix + final_text + config.key_value_layout_suffix + add_id_to_html_tag(config.key_value_layout_prefix, self.id, config) + final_text + config.key_value_layout_suffix ) elif self.layout_type == LAYOUT_FIGURE: final_text = ( - config.figure_layout_prefix + 
final_text + config.figure_layout_suffix + add_id_to_html_tag(config.figure_layout_prefix, self.id, config) + final_text + config.figure_layout_suffix ) elif self.layout_type == LAYOUT_ENTITY: final_text = ( - config.entity_layout_prefix + final_text + config.entity_layout_suffix + add_id_to_html_tag(config.entity_layout_prefix, self.id, config) + final_text + config.entity_layout_suffix ) elif self.layout_type == LAYOUT_FOOTER: final_text = ( - config.footer_layout_prefix + final_text + config.footer_layout_suffix + add_id_to_html_tag(config.footer_layout_prefix, self.id, config) + final_text + config.footer_layout_suffix ) if config.add_prefixes_and_suffixes_as_words: if self.layout_type == LAYOUT_TABLE: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_prefix, is_structure=True)] if config.table_layout_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.table_layout_prefix, self.id, config), is_structure=True)] if config.table_layout_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_suffix, is_structure=True)] if config.table_layout_suffix else []) ) elif self.layout_type == LAYOUT_KEY_VALUE: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_prefix, is_structure=True)] if config.key_value_layout_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.key_value_layout_prefix, self.id, config), is_structure=True)] if config.key_value_layout_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else []) ) elif self.layout_type == LAYOUT_FIGURE: final_words = ( - ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), 
config.figure_layout_prefix, is_structure=True)] if config.figure_layout_prefix else []) + + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), add_id_to_html_tag(config.figure_layout_prefix, self.id, config), is_structure=True)] if config.figure_layout_prefix else []) + final_words + ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.figure_layout_suffix, is_structure=True)] if config.figure_layout_suffix else []) ) diff --git a/textractor/entities/table.py b/textractor/entities/table.py index 08f91af6..bb5b5418 100644 --- a/textractor/entities/table.py +++ b/textractor/entities/table.py @@ -32,7 +32,7 @@ from textractor.utils.text_utils import group_elements_horizontally, linearize_children from textractor.data.text_linearization_config import TextLinearizationConfig from textractor.data.html_linearization_config import HTMLLinearizationConfig - +from textractor.utils.html_utils import add_id_to_html_tag class Table(DocumentEntity): """ @@ -684,7 +684,7 @@ def get_text_and_words( if len(words_) < local_config.table_min_table_words: return linearize_children(words_, config=config) - words = [Word(str(uuid.uuid4()), self.bbox, local_config.table_prefix)] if local_config.table_prefix else [] + words = [Word(str(uuid.uuid4()), self.bbox, add_id_to_html_tag(local_config.table_prefix, self.id, local_config))] if local_config.table_prefix else [] rows = sorted([(key, list(group)) for key, group in itertools.groupby( self.table_cells, key=lambda cell: cell.row_index )], key=lambda r: r[0]) @@ -858,7 +858,7 @@ def get_text_and_words( w.table_id = str(self.id) w.table_bbox = self.bbox - text = (local_config.table_prefix if local_config.add_prefixes_and_suffixes_in_text else "") + text = (add_id_to_html_tag(local_config.table_prefix, self.id, local_config) if local_config.add_prefixes_and_suffixes_in_text else "") # Markdown if local_config.table_linearization_format == "markdown": df = self.to_pandas( diff --git 
a/textractor/entities/value.py b/textractor/entities/value.py index 7eb0b3b9..3a4ef1c4 100644 --- a/textractor/entities/value.py +++ b/textractor/entities/value.py @@ -19,6 +19,7 @@ from textractor.visualizers.entitylist import EntityList from textractor.data.text_linearization_config import TextLinearizationConfig from textractor.utils.text_utils import linearize_children +from textractor.utils.html_utils import add_id_to_html_tag class Value(DocumentEntity): @@ -171,7 +172,7 @@ def get_text_and_words( no_new_lines=config.remove_new_lines_in_leaf_elements, ) if config.add_prefixes_and_suffixes_in_text: - text = config.value_prefix + text + config.value_suffix + text = add_id_to_html_tag(config.value_prefix, self.id, config) + text + config.value_suffix if config.add_prefixes_and_suffixes_as_words: words = ( ( @@ -179,7 +180,7 @@ def get_text_and_words( Word( str(uuid.uuid4()), self.bbox, - config.value_prefix, + add_id_to_html_tag(config.value_prefix, self.id, config), is_structure=True, is_clickable=( bool(words) and words[0] in [config.selection_element_selected, config.selection_element_not_selected] diff --git a/textractor/utils/html_utils.py b/textractor/utils/html_utils.py new file mode 100644 index 00000000..556cf4df --- /dev/null +++ b/textractor/utils/html_utils.py @@ -0,0 +1,9 @@ +from textractor.data.constants import HTMLLinearizationConfig + +def add_id_to_html_tag(prefix, id, config, shorten=False): + if not isinstance(config, HTMLLinearizationConfig): + return prefix + if shorten: + return prefix[:-1] + f" id={id[:8]}" + prefix[-1] + else: + return prefix[:-1] + f" id={id}>" + prefix[-1] From e74a0cb89613e0340b680d51de73b034be0f0d60 Mon Sep 17 00:00:00 2001 From: Edouard Belval Date: Wed, 14 Aug 2024 18:02:08 +0000 Subject: [PATCH 2/2] Deduplicate layout and child id --- textractor/data/html_linearization_config.py | 4 ++-- textractor/parsers/response_parser.py | 6 +++--- textractor/utils/html_utils.py | 14 ++++++++------ 3 files changed, 13 
insertions(+), 11 deletions(-) diff --git a/textractor/data/html_linearization_config.py b/textractor/data/html_linearization_config.py index d34e77ab..ace91611 100644 --- a/textractor/data/html_linearization_config.py +++ b/textractor/data/html_linearization_config.py @@ -80,6 +80,6 @@ class HTMLLinearizationConfig(TextLinearizationConfig): page_num_suffix: str = "" - add_ids_to_components: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML. + add_ids_to_html_tags: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML. - add_short_ids_to_components: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML \ No newline at end of file + add_short_ids_to_html_tags: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML diff --git a/textractor/parsers/response_parser.py b/textractor/parsers/response_parser.py index a0f48d9d..448cb8bb 100644 --- a/textractor/parsers/response_parser.py +++ b/textractor/parsers/response_parser.py @@ -531,7 +531,7 @@ def _create_signature_objects( if signature not in signatures_added: signatures_added.add(signature) layout = Layout( - entity_id=signature.id, + entity_id=str(uuid.uuid4()), bbox=signature.bbox, label=LAYOUT_ENTITY, reading_order=-1, @@ -1103,7 +1103,7 @@ def _create_table_objects( if table not in table_added: table_added.add(table) layout = Layout( - entity_id=table.id, + entity_id=str(uuid.uuid4()), bbox=table.bbox, label=LAYOUT_TABLE, reading_order=-1, @@ -1292,7 +1292,7 @@ def parse_document_api_response(response: dict) -> Document: if kv.id not in kv_added: kv_added.add(kv.id) layout = Layout( - entity_id=kv.id, + entity_id=str(uuid.uuid4()), bbox=kv.bbox, label=LAYOUT_KEY_VALUE, reading_order=-1, diff --git a/textractor/utils/html_utils.py b/textractor/utils/html_utils.py index 556cf4df..fa775102 100644 --- 
a/textractor/utils/html_utils.py
+++ b/textractor/utils/html_utils.py
@@ -1,9 +1,11 @@
-from textractor.data.constants import HTMLLinearizationConfig
+from textractor.data.html_linearization_config import HTMLLinearizationConfig
 
-def add_id_to_html_tag(prefix, id, config, shorten=False):
-    if not isinstance(config, HTMLLinearizationConfig):
+def add_id_to_html_tag(prefix, id, config):  # inserts id="..." just before the last character of prefix
+    if not isinstance(config, HTMLLinearizationConfig) or not prefix:  # non-HTML config or empty prefix: return it untouched
         return prefix
-    if shorten:
-        return prefix[:-1] + f" id={id[:8]}" + prefix[-1]
+    if config.add_short_ids_to_html_tags:  # short form: only the first 8 characters of the id
+        return prefix[:-1] + f' id="{id[:8]}"' + prefix[-1]
+    elif config.add_ids_to_html_tags:  # full form: the complete id
+        return prefix[:-1] + f' id="{id}"' + prefix[-1]
     else:
-        return prefix[:-1] + f" id={id}>" + prefix[-1]
+        return prefix