Skip to content

Commit

Permalink
Improve HTML linearization
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored Jun 19, 2024
2 parents 6f27f3f + 9b219d0 commit 86d1dcf
Show file tree
Hide file tree
Showing 28 changed files with 273 additions and 48 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

45 changes: 43 additions & 2 deletions tests/test_get_text_and_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
import boto3
import uuid
import logging
from lxml import html
from tests.utils import get_fixture_path

from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.data.html_linearization_config import HTMLLinearizationConfig
from textractor.entities.document import Document
from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
Expand All @@ -25,13 +27,14 @@ def setUp(self):
self.profile_name = "default"
if os.environ.get("CALL_TEXTRACT"):
self.s3_client = boto3.session.Session(
profile_name=self.profile_name
#profile_name=self.profile_name
).client("s3", region_name="us-west-2")

self.current_directory = os.path.abspath(os.path.dirname(__file__))

self.extractor = Textractor(
profile_name=self.profile_name, kms_key_id=""
region_name="us-west-2"
#profile_name=self.profile_name, kms_key_id=""
)
self.fixture_directory = os.path.join(self.current_directory, "fixtures")

Expand Down Expand Up @@ -284,3 +287,41 @@ def test_figure_layout_prefixes_and_suffixes_in_text_words(self):
"</figure>",
]:
self.assertTrue(token in words, f"{token} is not in words")

def test_document_to_html(self):
    """
    Check that ``Document.to_html`` only places text inside text-bearing tags.

    For every fixture asset the document is either analyzed live (when the
    ``CALL_TEXTRACT`` environment variable is set, in which case the raw
    response is also written back to the JSON fixture) or loaded from the
    JSON fixture with ``Document.open``. The HTML output is parsed with
    lxml and the test fails if any element other than a paragraph, heading,
    table cell or caption directly contains text.
    """
    assets = [
        "amzn_q2.png",
        "fake_id.png",
        "form_1005.png",
        "form.png",
        "in-table-title.png",
        "matrix.png",
        "patient_intake_form_sample.png",
        "paystub_header.png",
        "paystub_single_table.png",
        "paystub_tables.png",
        "reading_order.pdf",
        "receipt.jpg",
        "sample-invoice.pdf",
        "screenshot.png",
        "single-page-1.png",
        "single-page-2.png",
        "test.png",
        "textractor-singlepage-doc.pdf",
    ]
    # Tags that are allowed to hold text directly.
    text_tags = {"p", "h1", "h2", "h3", "h4", "h5", "th", "td", "caption"}
    # The config does not depend on the asset, so build it once. These
    # layout types get an inner <p> in addition to the <div> wrapper.
    # NOTE(review): presumably because their content is emitted as bare text - confirm.
    config = HTMLLinearizationConfig(
        figure_layout_prefix="<div><p>",
        figure_layout_suffix="</p></div>",
        footer_layout_prefix="<div><p>",
        footer_layout_suffix="</p></div>",
        key_value_layout_prefix="<div><p>",
        key_value_layout_suffix="</p></div>",
        page_num_prefix="<div><p>",
        page_num_suffix="</p></div>",
    )

    for asset in assets:
        with self.subTest(asset=asset):
            # NOTE(review): get_fixture_path() presumably derives the path from
            # the calling test, so it is called from this method directly - confirm
            # in tests.utils before moving it into a helper.
            fixture_path = get_fixture_path()[:-5] + "_" + asset + ".json"

            if os.environ.get("CALL_TEXTRACT"):
                document = self.extractor.analyze_document(
                    os.path.join(self.fixture_directory, asset),
                    features=[
                        TextractFeatures.LAYOUT,
                        TextractFeatures.TABLES,
                        TextractFeatures.FORMS,
                        TextractFeatures.SIGNATURES,
                    ],
                )
                # Refresh the cached response used by offline runs.
                with open(fixture_path, "w") as f:
                    json.dump(document.response, f)
            else:
                document = Document.open(fixture_path)

            root = html.fromstring(document.to_html(config))

            # iter() replaces the deprecated getiterator() and walks the same nodes.
            for node in root.iter():
                if node.text and node.tag not in text_tags:
                    # Report as a test failure rather than an unexpected error.
                    self.fail(f"Tag {node.tag} contains text {node.text}")

if __name__ == "__main__":
    # Allow running the HTML linearization check directly as a script.
    test_case = TestGetTextAndWords()
    test_case.test_document_to_html()
40 changes: 40 additions & 0 deletions textractor/data/html_linearization_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ class HTMLLinearizationConfig(TextLinearizationConfig):

title_suffix: str = "</h1>"

header_prefix: str = "<h1>"

header_suffix: str = "</h1>"

section_header_prefix: str = "<h2>"

section_header_suffix: str = "</h2>"
Expand All @@ -22,6 +26,10 @@ class HTMLLinearizationConfig(TextLinearizationConfig):

text_suffix: str = "</p>"

entity_layout_prefix: str = "<p>"

entity_layout_suffix: str = "</p>"

table_prefix: str = "<table>"

table_suffix: str = "</table>"
Expand All @@ -39,3 +47,35 @@ class HTMLLinearizationConfig(TextLinearizationConfig):
table_cell_suffix: str = "</td>"

table_column_separator: str = ""

table_linearization_format: str = "html"  #: Tables are represented as html in the linearized output

table_add_title_as_caption: bool = True  #: Adds the title inside the table as <caption></caption>

table_add_footer_as_paragraph: bool = True  #: NOTE(review): presumably emits table footers as a paragraph - confirm against the table linearization code

# Block-level layout elements are wrapped in <div>...</div>.

list_layout_prefix: str = "<div>"

list_layout_suffix: str = "</div>"

table_layout_prefix: str = "<div>"

table_layout_suffix: str = "</div>"

key_value_layout_prefix: str = "<div>"

key_value_layout_suffix: str = "</div>"

figure_layout_prefix: str = "<div>"

figure_layout_suffix: str = "</div>"

footer_layout_prefix: str = "<div>"

footer_layout_suffix: str = "</div>"

page_num_prefix: str = "<div>"

page_num_suffix: str = "</div>"
25 changes: 24 additions & 1 deletion textractor/data/text_linearization_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ class TextLinearizationConfig:

table_column_header_threshold: float = 0.9 #: Threshold for a row to be selected as header when rendering as markdown. 0.9 means that 90% of the cells must have the is_header_cell flag.

table_linearization_format: str = "plaintext" #: How to represent tables in the linearized output. Choices are plaintext, markdown or HTML.
table_linearization_format: str = "plaintext" #: How to represent tables in the linearized output. Choices are plaintext, markdown or html.

table_add_title_as_caption: bool = False #: When using html linearization format, adds the title inside the table as <caption></caption>

# FIXME: undocumented. Presumably, when using html linearization format, emits the table footers as a paragraph - confirm against the table linearization code
table_add_footer_as_paragraph: bool = False

table_tabulate_format: str = "github" #: Markdown tabulate format to use when table are linearized as markdown

Expand All @@ -78,6 +83,8 @@ class TextLinearizationConfig:

table_column_separator: str = "\t" #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature

table_flatten_semi_structured_as_plaintext: bool = False #: Ignores table structure for SEMI_STRUCTURED tables and returns them as text

table_prefix: str = ""

table_suffix: str = ""
Expand Down Expand Up @@ -106,6 +113,14 @@ class TextLinearizationConfig:

table_cell_cross_merge_cell_placeholder: str = "" #: Placeholder for left merge cell (X)

table_title_prefix: str = "" #: Prefix for table title if it is outside of the table (floating)

table_title_suffix: str = "" #: Suffix for table title if it is outside of the table (floating)

table_footers_prefix: str = "" #: Prefix for table footers if they are outside of the table (floating)

table_footers_suffix: str = "" #: Suffix for table footers if they are outside of the table (floating)

header_prefix: str = "" #: Prefix for header layout elements

header_suffix: str = "" #: Suffix for header layout elements
Expand Down Expand Up @@ -134,10 +149,18 @@ class TextLinearizationConfig:

value_suffix: str = "" #: Suffix for value elements

entity_layout_prefix: str = "" #: Prefix for LAYOUT_ENTITY elements (layout elements without a predicted layout type)

entity_layout_suffix: str = "" #: Suffix for LAYOUT_ENTITY elements (layout elements without a predicted layout type)

figure_layout_prefix: str = "" #: Prefix for figure layout elements

figure_layout_suffix: str = "" #: Suffix for figure layout elements

footer_layout_prefix: str = "" #: Prefix for footer layout elements

footer_layout_suffix: str = "" #: Suffix for footer layout elements

selection_element_selected: str = (
"[X]" #: Representation for selection element when selected
)
Expand Down
17 changes: 17 additions & 0 deletions textractor/entities/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from textractor.utils.search_utils import SearchUtils
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.data.html_linearization_config import HTMLLinearizationConfig
from textractor.entities.linearizable import Linearizable


Expand Down Expand Up @@ -281,6 +282,22 @@ def page(self, page_no: int = 0):
else:
raise InputError("page_no parameter doesn't match required data type.")

def to_html(self, config: HTMLLinearizationConfig = HTMLLinearizationConfig()):
    """
    Returns the HTML representation of the whole document.

    Every page is rendered with its own ``to_html(config=config)`` and wrapped
    in a ``<div>``; the concatenated pages are enclosed in
    ``<html><body>...</body></html>``.

    :param config: HTML linearization configuration forwarded to each page
    :type config: HTMLLinearizationConfig
    :return: HTML text of the document
    :rtype: str
    """
    page_divs = (
        f"<div>{page.to_html(config=config)}</div>" for page in self.pages
    )
    return "<html><body>" + "".join(page_divs) + "</body></html>"

def __repr__(self):
return os.linesep.join(
[
Expand Down
2 changes: 1 addition & 1 deletion textractor/entities/key_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(

self._words: EntityList[Word] = []
self.contains_checkbox = contains_checkbox
self.confidence_score = confidence / 100
self._confidence = confidence / 100
self._value: Value = value
self.selection_status = False
self._page = None
Expand Down
11 changes: 11 additions & 0 deletions textractor/entities/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,9 @@ def get_text_and_words(
config,
no_new_lines=True,
)
final_text = (
config.text_prefix + final_text + config.text_suffix
)
else:
final_text, final_words = linearize_children(
self.children,
Expand All @@ -236,6 +239,14 @@ def get_text_and_words(
final_text = (
config.figure_layout_prefix + final_text + config.figure_layout_suffix
)
elif self.layout_type == LAYOUT_ENTITY:
final_text = (
config.entity_layout_prefix + final_text + config.entity_layout_suffix
)
elif self.layout_type == LAYOUT_FOOTER:
final_text = (
config.footer_layout_prefix + final_text + config.footer_layout_suffix
)
if config.add_prefixes_and_suffixes_as_words:
if self.layout_type == LAYOUT_TABLE:
final_words = (
Expand Down
12 changes: 8 additions & 4 deletions textractor/entities/linearizable.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def get_text(
"""
Returns the linearized text of the entity
:param config: Text linearization config
:type config:
:return: Linearized text of the entity
:rtype: str
"""
Expand All @@ -33,26 +35,28 @@ def text(self) -> str:
return self.get_text()

def to_html(
self
self,
config: HTMLLinearizationConfig = HTMLLinearizationConfig()
) -> str:
"""
Returns the HTML representation of the entity
:return: HTML text of the entity
:rtype: str
"""
return self.get_text(HTMLLinearizationConfig())
return self.get_text(config)

def to_markdown(
self
self,
config: MarkdownLinearizationConfig = MarkdownLinearizationConfig()
) -> str:
"""
Returns the markdown representation of the entity
:return: Markdown text of the entity
:rtype: str
"""
return self.get_text(MarkdownLinearizationConfig())
return self.get_text(config)

@abstractmethod
def get_text_and_words(
Expand Down
Loading

0 comments on commit 86d1dcf

Please sign in to comment.