Skip to content

Commit

Permalink
Add legacy check to parser
Browse files: browse the repository at this point in the history
  • Loading branch information
Belval committed Nov 6, 2024
1 parent 1998338 commit 94ac825
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 2 deletions.
4 changes: 2 additions & 2 deletions textractor/parsers/response_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
LAYOUT_TABLE,
LAYOUT_KEY_VALUE,
)
from textractor.utils.legacy_utils import converter

THRESHOLD = 0.95

Expand Down Expand Up @@ -1542,7 +1543,6 @@ def parser_analyze_expense_response(response):
document.response = response
return document


def parse(response: dict) -> Document:
"""
Ingests response data and API Call Mode and calls the appropriate function for it.
Expand All @@ -1559,4 +1559,4 @@ def parse(response: dict) -> Document:
if "ExpenseDocuments" in response:
return parser_analyze_expense_response(response)
else:
return parse_document_api_response(response)
return parse_document_api_response(converter(response))
51 changes: 51 additions & 0 deletions textractor/utils/legacy_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from textractor.data.constants import (
LAYOUT_FIGURE,
LAYOUT_LIST,
LAYOUT_TABLE,
LAYOUT_KEY_VALUE,
LAYOUT_TEXT,
LAYOUT_TITLE,
LAYOUT_HEADER,
LAYOUT_FOOTER,
LAYOUT_SECTION_HEADER,
LAYOUT_PAGE_NUMBER,
)

def converter(response):
    """Normalize the layout blocks of a response dict in place.

    Walks ``response["Blocks"]`` and applies three rewrites:

    * a block whose ``BlockType`` starts with ``"LAYOUT_FIGURE_"`` is
      relabelled ``LAYOUT_TEXT``;
    * any other ``"LAYOUT_"``-prefixed type that is not one of the layout
      constants listed below is relabelled ``LAYOUT_FIGURE``;
    * a ``LAYOUT_FIGURE`` block whose ``EntityTypes`` contains
      ``"CONTAINER"`` is deleted from ``response["Blocks"]`` and its ``Id``
      is dropped from the ``CHILD`` relationships of the PAGE blocks.

    NOTE(review): judging by the module name this exists so that responses
    carrying newer layout block types can still be read by the existing
    parser -- confirm against the service documentation.

    Args:
        response: Response dict with a ``"Blocks"`` list of block dicts.
            It is mutated in place.

    Returns:
        The same ``response`` object, after the rewrites.

    Raises:
        KeyError: If ``response`` has no ``"Blocks"`` key, or a block that
            must be removed has no ``"Id"``.
    """
    container_figures = []  # (index, block) pairs scheduled for removal
    page_blocks = []  # every PAGE block, not just the last one seen
    for index, block in enumerate(response["Blocks"]):
        block_type = block.get("BlockType", "")
        if block_type == "PAGE":
            page_blocks.append(block)
        elif block_type.startswith("LAYOUT_FIGURE_"):
            block["BlockType"] = LAYOUT_TEXT
        elif block_type.startswith("LAYOUT_") and block_type not in (
            LAYOUT_TEXT,
            LAYOUT_TITLE,
            LAYOUT_HEADER,
            LAYOUT_FOOTER,
            LAYOUT_SECTION_HEADER,
            LAYOUT_PAGE_NUMBER,
            LAYOUT_LIST,
            LAYOUT_FIGURE,
            LAYOUT_TABLE,
            LAYOUT_KEY_VALUE,
        ):
            # Unrecognised layout type: fall back to a generic figure.
            block["BlockType"] = LAYOUT_FIGURE
        elif (
            block_type == LAYOUT_FIGURE
            and "CONTAINER" in (block.get("EntityTypes") or ())
        ):
            container_figures.append((index, block))

    if not container_figures:
        # Nothing to remove, so the page relationships need not exist at all
        # (e.g. a response without a PAGE block, or a blank page).
        return response

    removed_ids = {block["Id"] for _, block in container_figures}

    # A removed figure may be a child of any page, so filter every PAGE
    # block's CHILD relationship instead of assuming a single page.
    for page in page_blocks:
        for relationship in page.get("Relationships") or ():
            if relationship.get("Type") == "CHILD" and relationship.get("Ids"):
                # Slice-assign so the existing list object is updated in place.
                relationship["Ids"][:] = [
                    child_id
                    for child_id in relationship["Ids"]
                    if child_id not in removed_ids
                ]

    # Delete from the highest index down so earlier indices stay valid.
    for index, _ in reversed(container_figures):
        del response["Blocks"][index]

    return response

0 comments on commit 94ac825

Please sign in to comment.