Skip to content

Commit

Permalink
Add tests for markdown tables
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval committed Jun 24, 2024
1 parent 2d06c1f commit d0b00bb
Show file tree
Hide file tree
Showing 39 changed files with 70 additions and 29 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

25 changes: 23 additions & 2 deletions tests/test_get_text_and_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ def setUp(self):
self.profile_name = "default"
if os.environ.get("CALL_TEXTRACT"):
self.s3_client = boto3.session.Session(
profile_name=self.profile_name
#profile_name=self.profile_name
).client("s3", region_name="us-west-2")

self.current_directory = os.path.abspath(os.path.dirname(__file__))

self.extractor = Textractor(
profile_name=self.profile_name, kms_key_id=""
#profile_name=self.profile_name, kms_key_id=""
region_name="us-west-2"
)
self.fixture_directory = os.path.join(self.current_directory, "fixtures")

Expand Down Expand Up @@ -321,6 +322,26 @@ def test_document_to_html(self):
for node in root.getiterator():
if node.text and node.tag not in ["p", "h1", "h2", "h3", "h4", "h5", "th", "td", "caption"]:
raise Exception(f"Tag {node.tag} contains text {node.text}")

def test_document_to_markdown(self):
    """Check that every fixture document can be rendered to markdown.

    For each asset the document is obtained in one of two ways:

    * when the ``CALL_TEXTRACT`` environment variable is set, the asset is
      sent through ``self.extractor.analyze_document`` and the response is
      written to the per-asset JSON fixture;
    * otherwise the document is loaded from that JSON fixture.

    No assertion is made on the markdown content itself; the test only
    requires that ``to_markdown()`` completes without raising.
    """
    assets = [
        "amzn_q2.png",
        "fake_id.png",
        "form_1005.png",
        "form.png",
        "in-table-title.png",
        "matrix.png",
        "patient_intake_form_sample.png",
        "paystub_header.png",
        "paystub_single_table.png",
        "paystub_tables.png",
        "reading_order.pdf",
        "receipt.jpg",
        "sample-invoice.pdf",
        "screenshot.png",
        "single-page-1.png",
        "single-page-2.png",
        "test.png",
        "textractor-singlepage-doc.pdf",
    ]
    for asset in assets:
        # subTest labels a failure with the offending asset and lets the
        # remaining assets run instead of aborting on the first error.
        with self.subTest(asset=asset):
            # NOTE(review): the [:-5] slice presumably strips a ".json"
            # suffix from the path returned by get_fixture_path() - confirm.
            fixture_path = get_fixture_path()[:-5] + "_" + asset + ".json"
            if os.environ.get("CALL_TEXTRACT"):
                document = self.extractor.analyze_document(
                    os.path.join(self.fixture_directory, asset),
                    features=[
                        TextractFeatures.LAYOUT,
                        TextractFeatures.TABLES,
                        TextractFeatures.FORMS,
                        TextractFeatures.SIGNATURES,
                    ],
                )
                # Persist the live response for later runs that load from disk.
                with open(fixture_path, "w") as f:
                    json.dump(document.response, f)
            else:
                document = Document.open(fixture_path)

            # Rendering must not raise for any asset.
            document.to_markdown()

# Script entry point: instantiates the test case and calls
# test_document_to_html directly.
# NOTE(review): a direct method call presumably bypasses the unittest runner,
# so setUp() would not run first - confirm this is intended.
if __name__ == "__main__":
TestGetTextAndWords().test_document_to_html()
16 changes: 9 additions & 7 deletions tests/test_textractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
class TestTextractor(unittest.TestCase):
def setUp(self):
# insert credentials and filepaths here to run test
self.profile_name = "default"
#self.profile_name = "default"
self.bucket_name = os.environ.get("S3_BUCKET", "textractor-tests")
if os.environ.get("CALL_TEXTRACT"):
self.s3_client = boto3.session.Session(
profile_name=self.profile_name
#profile_name=self.profile_name
region_name="us-west-2"
).client("s3", region_name="us-west-2")

self.current_directory = os.path.abspath(os.path.dirname(__file__))
Expand All @@ -35,12 +36,13 @@ def setUp(self):
self.image_1 = PIL.Image.open(os.path.join(self.current_directory, "fixtures/single-page-1.png"))
self.image_2 = PIL.Image.open(os.path.join(self.current_directory, "fixtures/single-page-2.png"))

if self.profile_name is None:
raise InvalidProfileNameError(
"Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
)
#if self.profile_name is None:
# raise InvalidProfileNameError(
# "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
# )
self.extractor = Textractor(
profile_name=self.profile_name, kms_key_id=""
#profile_name=self.profile_name, kms_key_id=""
region_name="us-west-2"
)

def test_detect_document_text(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_textractor_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ def run_command(cmds: List):
class TestTextractor(unittest.TestCase):
def setUp(self):
# insert credentials and filepaths here to run test
self.profile_name = "default"
#self.profile_name = "default"
self.bucket_name = os.environ.get("S3_BUCKET", "textractor-tests")
if os.environ.get("CALL_TEXTRACT"):
self.s3_client = boto3.session.Session(
profile_name=self.profile_name
#profile_name=self.profile_name
).client("s3", region_name="us-west-2")

self.current_directory = os.path.abspath(os.path.dirname(__file__))
Expand Down

0 comments on commit d0b00bb

Please sign in to comment.