Add tests for markdown tables

aws-samples · Jun 24, 2024 · d0b00bb · d0b00bb
1 parent 2d06c1f
commit d0b00bb
Show file tree

Hide file tree

Showing 39 changed files with 70 additions and 29 deletions.
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_amzn_q2.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_amzn_q2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_fake_id.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_fake_id.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_form.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_form.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_form_1005.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_form_1005.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_in-table-title.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_in-table-title.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_matrix.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_matrix.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_patient_intake_form_sample.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_patient_intake_form_sample.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_header.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_header.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_single_table.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_single_table.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_paystub_tables.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_paystub_tables.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_reading_order.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_reading_order.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_receipt.jpg.json b/tests/fixtures/saved_api_responses/test_document_to_html_receipt.jpg.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_sample-invoice.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_sample-invoice.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_screenshot.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_screenshot.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_single-page-1.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_single-page-1.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_single-page-2.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_single-page-2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_test.png.json b/tests/fixtures/saved_api_responses/test_document_to_html_test.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_html_textractor-singlepage-doc.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_html_textractor-singlepage-doc.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_amzn_q2.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_amzn_q2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_fake_id.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_fake_id.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_form.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_form.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_form_1005.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_form_1005.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_in-table-title.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_in-table-title.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_matrix.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_matrix.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_patient_intake_form_sample.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_patient_intake_form_sample.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_header.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_header.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_single_table.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_single_table.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_tables.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_paystub_tables.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_reading_order.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_reading_order.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_receipt.jpg.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_receipt.jpg.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_sample-invoice.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_sample-invoice.pdf.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_screenshot.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_screenshot.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_single-page-1.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_single-page-1.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_single-page-2.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_single-page-2.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_test.png.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_test.png.json
diff --git a/tests/fixtures/saved_api_responses/test_document_to_markdown_textractor-singlepage-doc.pdf.json b/tests/fixtures/saved_api_responses/test_document_to_markdown_textractor-singlepage-doc.pdf.json
diff --git a/tests/test_get_text_and_words.py b/tests/test_get_text_and_words.py
@@ -27,13 +27,14 @@ def setUp(self):
         self.profile_name = "default"
         if os.environ.get("CALL_TEXTRACT"):
             self.s3_client = boto3.session.Session(
-                profile_name=self.profile_name
+                #profile_name=self.profile_name
             ).client("s3", region_name="us-west-2")
 
             self.current_directory = os.path.abspath(os.path.dirname(__file__))
 
             self.extractor = Textractor(
-                profile_name=self.profile_name, kms_key_id=""
+                #profile_name=self.profile_name, kms_key_id=""
+                region_name="us-west-2"
             )
             self.fixture_directory = os.path.join(self.current_directory, "fixtures")
 
@@ -321,6 +322,26 @@ def test_document_to_html(self):
             for node in root.getiterator():
                 if node.text and node.tag not in ["p", "h1", "h2", "h3", "h4", "h5", "th", "td", "caption"]:
                     raise Exception(f"Tag {node.tag} contains text {node.text}")
+
+    def test_document_to_markdown(self):
+        for asset in ["amzn_q2.png", "fake_id.png", "form_1005.png", "form.png", "in-table-title.png", "matrix.png", "patient_intake_form_sample.png", "paystub_header.png", "paystub_single_table.png", "paystub_tables.png", "reading_order.pdf", "receipt.jpg", "sample-invoice.pdf", "screenshot.png", "single-page-1.png", "single-page-2.png", "test.png", "textractor-singlepage-doc.pdf"]:
+            # Smoke test: every asset must convert to markdown without raising (the output is not inspected)
+            if os.environ.get("CALL_TEXTRACT"):
+                document = self.extractor.analyze_document(
+                    os.path.join(self.fixture_directory, asset),
+                    features=[
+                        TextractFeatures.LAYOUT,
+                        TextractFeatures.TABLES,
+                        TextractFeatures.FORMS,
+                        TextractFeatures.SIGNATURES
+                    ]
+                )
+                with open(get_fixture_path()[:-5] + "_" + asset + ".json", "w") as f:
+                    json.dump(document.response, f)
+            else:
+                document = Document.open(get_fixture_path()[:-5] + "_" + asset + ".json")
+
+            document.to_markdown()
 
 if __name__ == "__main__":
     TestGetTextAndWords().test_document_to_html()
diff --git a/tests/test_textractor.py b/tests/test_textractor.py
@@ -17,11 +17,12 @@
 class TestTextractor(unittest.TestCase):
     def setUp(self):
         # insert credentials and filepaths here to run test
-        self.profile_name = "default"
+        #self.profile_name = "default"
         self.bucket_name = os.environ.get("S3_BUCKET", "textractor-tests")
         if os.environ.get("CALL_TEXTRACT"):
             self.s3_client = boto3.session.Session(
-                profile_name=self.profile_name
+                #profile_name=self.profile_name
+                region_name="us-west-2"
             ).client("s3", region_name="us-west-2")
 
             self.current_directory = os.path.abspath(os.path.dirname(__file__))
@@ -35,12 +36,13 @@ def setUp(self):
             self.image_1 = PIL.Image.open(os.path.join(self.current_directory, "fixtures/single-page-1.png"))
             self.image_2 = PIL.Image.open(os.path.join(self.current_directory, "fixtures/single-page-2.png"))
 
-            if self.profile_name is None:
-                raise InvalidProfileNameError(
-                    "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
-                )
+            #if self.profile_name is None:
+            #    raise InvalidProfileNameError(
+            #        "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
+            #    )
             self.extractor = Textractor(
-                profile_name=self.profile_name, kms_key_id=""
+                #profile_name=self.profile_name, kms_key_id=""
+                region_name="us-west-2"
             )
 
     def test_detect_document_text(self):

diff --git a/tests/test_textractor_cli.py b/tests/test_textractor_cli.py
@@ -26,11 +26,11 @@ def run_command(cmds: List):
 class TestTextractor(unittest.TestCase):
     def setUp(self):
         # insert credentials and filepaths here to run test
-        self.profile_name = "default"
+        #self.profile_name = "default"
         self.bucket_name = os.environ.get("S3_BUCKET", "textractor-tests")
         if os.environ.get("CALL_TEXTRACT"):
             self.s3_client = boto3.session.Session(
-                profile_name=self.profile_name
+                #profile_name=self.profile_name
             ).client("s3", region_name="us-west-2")
 
             self.current_directory = os.path.abspath(os.path.dirname(__file__))