Improve HTML linearization

aws-samples · Jun 19, 2024 · 86d1dcf · 86d1dcf
2 parents 6f27f3f + 9b219d0
commit 86d1dcf
Show file tree

Hide file tree

Showing 28 changed files with 273 additions and 48 deletions.
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_amzn_q2.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_amzn_q2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_fake_id.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_fake_id.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_form.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_form.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_form_1005.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_form_1005.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_in-table-title.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_in-table-title.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_matrix.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_matrix.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_patient_intake_form_sample.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_patient_intake_form_sample.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub.jpg.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub.jpg.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_header.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_header.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_single_table.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_single_table.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_tables.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_tables.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_reading_order.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_reading_order.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_receipt.jpg.json b/tests/fixtures/saved_api_responses/test_document_to_html_receipt.jpg.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_sample-invoice.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_sample-invoice.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_screenshot.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_screenshot.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_single-page-1.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_single-page-1.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_single-page-2.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_single-page-2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_test.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_test.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_textractor-singlepage-doc.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_textractor-singlepage-doc.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_tutorial.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_tutorial.pdf.json
diff --git a/tests/test_get_text_and_words.py b/tests/test_get_text_and_words.py
@@ -5,11 +5,13 @@
 import boto3
 import uuid
 import logging
+from lxml import html
 from tests.utils import get_fixture_path
 
 from textractor import Textractor
 from textractor.data.constants import TextractFeatures
 from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.data.html_linearization_config import HTMLLinearizationConfig
 from textractor.entities.document import Document
 from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
 from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
@@ -25,13 +27,14 @@ def setUp(self):
         self.profile_name = "default"
         if os.environ.get("CALL_TEXTRACT"):
             self.s3_client = boto3.session.Session(
-                profile_name=self.profile_name
+                #profile_name=self.profile_name
             ).client("s3", region_name="us-west-2")
 
             self.current_directory = os.path.abspath(os.path.dirname(__file__))
 
             self.extractor = Textractor(
-                profile_name=self.profile_name, kms_key_id=""
+                region_name="us-west-2"
+                #profile_name=self.profile_name, kms_key_id=""
             )
             self.fixture_directory = os.path.join(self.current_directory, "fixtures")
 
@@ -284,3 +287,41 @@ def test_figure_layout_prefixes_and_suffixes_in_text_words(self):
             "</figure>",
         ]:
             self.assertTrue(token in words, f"{token} is not in words")
+
+    def test_document_to_html(self):
+        for asset in ["amzn_q2.png", "fake_id.png", "form_1005.png", "form.png", "in-table-title.png", "matrix.png", "patient_intake_form_sample.png", "paystub_header.png", "paystub_single_table.png", "paystub_tables.png", "reading_order.pdf", "receipt.jpg", "sample-invoice.pdf", "screenshot.png", "single-page-1.png", "single-page-2.png", "test.png", "textractor-singlepage-doc.pdf"]:
+            # Testing that no asset produces HTML where text sits directly inside a non-text tag (e.g. a bare <div>)
+            if os.environ.get("CALL_TEXTRACT"):
+                document = self.extractor.analyze_document(
+                    os.path.join(self.fixture_directory, asset),
+                    features=[
+                        TextractFeatures.LAYOUT,
+                        TextractFeatures.TABLES,
+                        TextractFeatures.FORMS,
+                        TextractFeatures.SIGNATURES
+                    ]
+                )
+                with open(get_fixture_path()[:-5] + "_" + asset + ".json", "w") as f:
+                    json.dump(document.response, f)
+            else:
+                document = Document.open(get_fixture_path()[:-5] + "_" + asset + ".json")
+
+
+            html_document = document.to_html(HTMLLinearizationConfig(
+                figure_layout_prefix="<div><p>",
+                figure_layout_suffix="</p></div>",
+                footer_layout_prefix="<div><p>",
+                footer_layout_suffix="</p></div>",
+                key_value_layout_prefix="<div><p>",
+                key_value_layout_suffix="</p></div>",
+                page_num_prefix="<div><p>",
+                page_num_suffix="</p></div>",
+            ))
+            root = html.fromstring(html_document)
+
+            for node in root.getiterator():
+                if node.text and node.tag not in ["p", "h1", "h2", "h3", "h4", "h5", "th", "td", "caption"]:
+                    raise Exception(f"Tag {node.tag} contains text {node.text}")
+
+if __name__ == "__main__":
+    TestGetTextAndWords().test_document_to_html()
diff --git a/textractor/data/html_linearization_config.py b/textractor/data/html_linearization_config.py
@@ -14,6 +14,10 @@ class HTMLLinearizationConfig(TextLinearizationConfig):
 
     title_suffix: str = "</h1>"
 
+    header_prefix: str = "<h1>"
+
+    header_suffix: str = "</h1>"
+
     section_header_prefix: str = "<h2>"
 
     section_header_suffix: str = "</h2>"
@@ -22,6 +26,10 @@ class HTMLLinearizationConfig(TextLinearizationConfig):
 
     text_suffix: str = "</p>"
 
+    entity_layout_prefix: str = "<p>"
+
+    entity_layout_suffix: str = "</p>"
+
     table_prefix: str = "<table>"
 
     table_suffix: str = "</table>"
@@ -39,3 +47,35 @@ class HTMLLinearizationConfig(TextLinearizationConfig):
     table_cell_suffix: str = "</td>"
 
     table_column_separator: str = ""
+
+    table_linearization_format: str = "html"
+
+    table_add_title_as_caption: bool = True
+
+    table_add_footer_as_paragraph: bool = True
+
+    table_column_separator: str = ""
+
+    list_layout_prefix: str = "<div>"
+
+    list_layout_suffix: str = "</div>"
+
+    table_layout_prefix: str = "<div>"
+
+    table_layout_suffix: str = "</div>"
+
+    key_value_layout_prefix: str = "<div>"
+
+    key_value_layout_suffix: str = "</div>"
+
+    figure_layout_prefix: str = "<div>"
+
+    figure_layout_suffix: str = "</div>"
+
+    footer_layout_prefix: str = "<div>"
+
+    footer_layout_suffix: str = "</div>"
+
+    page_num_prefix: str = "<div>"
+
+    page_num_suffix: str = "</div>"
diff --git a/textractor/data/text_linearization_config.py b/textractor/data/text_linearization_config.py
@@ -64,7 +64,12 @@ class TextLinearizationConfig:
 
     table_column_header_threshold: float = 0.9 #: Threshold for a row to be selected as header when rendering as markdown. 0.9 means that 90% of the cells must have the is_header_cell flag. 
 
-    table_linearization_format: str = "plaintext"  #: How to represent tables in the linearized output. Choices are plaintext, markdown or HTML.
+    table_linearization_format: str = "plaintext"  #: How to represent tables in the linearized output. Choices are plaintext, markdown or html.
+
+    table_add_title_as_caption: bool = False #: When using html linearization format, adds the title inside the table as <caption></caption>
+
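+    # FIXME: undocumented option; presumably emits table footers as a paragraph when using the html linearization format (confirm)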
+    table_add_footer_as_paragraph: bool = False 
 
     table_tabulate_format: str = "github"  #: Markdown tabulate format to use when table are linearized as markdown
 
@@ -78,6 +83,8 @@ class TextLinearizationConfig:
 
     table_column_separator: str = "\t"  #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature
 
+    table_flatten_semi_structured_as_plaintext: bool = False #: Ignores table structure for SEMI_STRUCTURED tables and returns them as text
+
     table_prefix: str = ""
 
     table_suffix: str = ""
@@ -106,6 +113,14 @@ class TextLinearizationConfig:
 
     table_cell_cross_merge_cell_placeholder: str = "" #: Placeholder for left merge cell (X)
 
+    table_title_prefix: str = "" #: Prefix for table title if it is outside of the table (floating)
+
+    table_title_suffix: str = "" #: Suffix for table title if it is outside of the table (floating)
+
+    table_footers_prefix: str = "" #: Prefix for table footers if they are outside of the table (floating)
+
+    table_footers_suffix: str = "" #: Suffix for table footers if they are outside of the table (floating)
+
     header_prefix: str = ""  #: Prefix for header layout elements
 
     header_suffix: str = ""  #: Suffix for header layout elements
@@ -134,10 +149,18 @@ class TextLinearizationConfig:
 
     value_suffix: str = ""  #: Suffix for value elements
 
+    entity_layout_prefix: str = "" #: Prefix for LAYOUT_ENTITY elements (layout elements without a predicted layout type)
+
+    entity_layout_suffix: str = "" #: Suffix for LAYOUT_ENTITY elements (layout elements without a predicted layout type)
+
     figure_layout_prefix: str = "" #: Prefix for figure layout elements 
 
     figure_layout_suffix: str = "" #: Suffix for figure layout elements
 
+    footer_layout_prefix: str = "" #: Prefix for footer layout elements
+
+    footer_layout_suffix: str = "" #: Suffix for footer layout elements
+
     selection_element_selected: str = (
         "[X]"  #: Representation for selection element when selected
     )

diff --git a/textractor/entities/document.py b/textractor/entities/document.py
@@ -35,6 +35,7 @@
 )
 from textractor.utils.search_utils import SearchUtils
 from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.data.html_linearization_config import HTMLLinearizationConfig
 from textractor.entities.linearizable import Linearizable
 
 
@@ -281,6 +282,22 @@ def page(self, page_no: int = 0):
         else:
             raise InputError("page_no parameter doesn't match required data type.")
 
+    def to_html(self, config: HTMLLinearizationConfig = HTMLLinearizationConfig()):
+        """
+        Returns the HTML representation of the document. Effectively calls Linearizable.to_html() on each page,
+        wraps every page in a <div>, and surrounds the result with <html><body></body></html>.
+
+        :return: HTML text of the entity
+        :rtype: str
+        """
+
+        html = "<html><body>"
+        for page in self.pages:
+            html += f"<div>{page.to_html(config=config)}</div>"
+        html += "</body></html>"
+
+        return html
+
     def __repr__(self):
         return os.linesep.join(
             [

diff --git a/textractor/entities/key_value.py b/textractor/entities/key_value.py
@@ -48,7 +48,7 @@ def __init__(
 
         self._words: EntityList[Word] = []
         self.contains_checkbox = contains_checkbox
-        self.confidence_score = confidence / 100
+        self._confidence = confidence / 100
         self._value: Value = value
         self.selection_status = False
         self._page = None

diff --git a/textractor/entities/layout.py b/textractor/entities/layout.py
@@ -215,6 +215,9 @@ def get_text_and_words(
                 config,
                 no_new_lines=True,
             )
+            final_text = (
+                config.text_prefix + final_text + config.text_suffix
+            )
         else:
             final_text, final_words = linearize_children(
                 self.children,
@@ -236,6 +239,14 @@ def get_text_and_words(
                     final_text = (
                         config.figure_layout_prefix + final_text + config.figure_layout_suffix
                     )
+                elif self.layout_type == LAYOUT_ENTITY:
+                    final_text = (
+                        config.entity_layout_prefix + final_text + config.entity_layout_suffix
+                    )
+                elif self.layout_type == LAYOUT_FOOTER:
+                    final_text = (
+                        config.footer_layout_prefix + final_text + config.footer_layout_suffix
+                    )
             if config.add_prefixes_and_suffixes_as_words:
                 if self.layout_type == LAYOUT_TABLE:
                     final_words = (

diff --git a/textractor/entities/linearizable.py b/textractor/entities/linearizable.py
@@ -16,6 +16,8 @@ def get_text(
         """
         Returns the linearized text of the entity
 
+        :param config: Text linearization configuration
+        :type config: TextLinearizationConfig
         :return: Linearized text of the entity
         :rtype: str
         """
@@ -33,26 +35,28 @@ def text(self) -> str:
         return self.get_text()
 
     def to_html(
-        self
+        self,
+        config: HTMLLinearizationConfig = HTMLLinearizationConfig()
     ) -> str:
         """
         Returns the HTML representation of the entity
 
         :return: HTML text of the entity
         :rtype: str
         """
-        return self.get_text(HTMLLinearizationConfig())
+        return self.get_text(config)
 
     def to_markdown(
-        self
+        self,
+        config: MarkdownLinearizationConfig = MarkdownLinearizationConfig()
     ) -> str:
         """
         Returns the markdown representation of the entity
 
         :return: Markdown text of the entity
         :rtype: str
         """
-        return self.get_text(MarkdownLinearizationConfig())
+        return self.get_text(config)
 
     @abstractmethod
     def get_text_and_words(