From aabdcff5dd107de186c21fabc555c0c883738df1 Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 12:14:24 +0530 Subject: [PATCH 1/9] test: add unit test for ErrorWhileProcessingElement --- ...st_abstract_elementwise_processing_step.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py index 511ed79..3bf91db 100644 --- a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py +++ b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py @@ -5,11 +5,12 @@ import bs4 import pytest -from sec_parser.exceptions import SecParserValueError +from sec_parser.exceptions import SecParserError, SecParserValueError from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( AbstractElementwiseProcessingStep, ElementProcessingContext, + ErrorWhileProcessingElement, ) from sec_parser.semantic_elements.abstract_semantic_element import ( AbstractSemanticElement, @@ -46,6 +47,15 @@ def _process_element( return element +class ErrorRaisingProcessingStep(AbstractElementwiseProcessingStep): + def _process_element( + self, + element: AbstractSemanticElement, + _: ElementProcessingContext, + ) -> AbstractSemanticElement: + raise SecParserError + + def test_process_skip_due_to_types_to_process(): """Test that elements not in 'types_to_process' are skipped.""" # Arrange @@ -107,3 +117,28 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude(): assert step.seen_elements == [element1] assert processed_elements == input_elements assert processed_elements == input_elements + + +def test_error_while_processing_element(): + # Arrange + input_elements = [MockSemanticElement(Mock())] + step = ErrorRaisingProcessingStep() + + # Act + elements = 
step.process(input_elements) + + # Assert + assert isinstance(elements[0], ErrorWhileProcessingElement) + + +def test_error_while_processing_element_with_no_error(): + # Arrange + element = MockSemanticElement(Mock()) + + # Act & Assert + with pytest.raises(SecParserValueError): + error_processing_element = ErrorWhileProcessingElement.create_from_element( + element, + error=None, + log_origin=None + ) \ No newline at end of file From 3e1b2483e69b8b72679d9138a83cce92f259284d Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 12:59:39 +0530 Subject: [PATCH 2/9] test: add unit tests for semantic tree render and print methods. --- .../unit/semantic_tree/test_semantic_tree.py | 118 +++++++++++++----- 1 file changed, 89 insertions(+), 29 deletions(-) diff --git a/tests/unit/semantic_tree/test_semantic_tree.py b/tests/unit/semantic_tree/test_semantic_tree.py index b5c02b6..3cf1dea 100644 --- a/tests/unit/semantic_tree/test_semantic_tree.py +++ b/tests/unit/semantic_tree/test_semantic_tree.py @@ -11,6 +11,93 @@ from sec_parser.semantic_tree.tree_node import TreeNode +class MockSemanticElement(AbstractSemanticElement): + pass + + +def element(text): + t = bs4.Tag(name="p") + t.string = text + return MockSemanticElement(HtmlTag(t)) + + +def build_tree(tree_structure, parent=None): + nodes = [] + assert isinstance(tree_structure, list) + for item in tree_structure: + if isinstance(item, dict): + for key, value in item.items(): + # Create a new TreeNode for each key + node = TreeNode(element(key), parent=parent) + nodes.append(node) + # Recursively build the tree for the children of the node + if isinstance(value, list): + children = build_tree(value, parent=node) + node.add_children(children) + else: + child_node = TreeNode(element(item), parent=parent) + nodes.append(child_node) + return nodes + + +@pytest.mark.parametrize( + ("name", "tree_structure", "render_kwargs", "expected_output"), + values := [ + ("empty_tree_with_default_kwargs", [], {}, ""), + ], 
+ ids = [v[0] for v in values], +) +def test_render(name, tree_structure, render_kwargs, expected_output): + # Arrange + root_nodes = build_tree(tree_structure) + tree = SemanticTree(root_nodes) + + # Act + actual = tree.render(**render_kwargs) + + # Assert + assert actual==expected_output + + +@pytest.mark.parametrize( + ("name", "tree_structure", "print_kwargs", "expected_output"), + values := [ + ( + "empty_tree_with_default_kwargs", + [], + {}, + "\n" + ), + ( + "simple_tree", + [{"root": ["child1", "child2"]}], + {}, + "\x1b[1;34mMockSemanticElement\x1b[0m: root\n├── \x1b[1;34mMockSemanticElement\x1b[0m: child1\n└── \x1b[1;34mMockSemanticElement\x1b[0m: child2\n" + ), + ( + "simple_tree_with_line_limit", + [{"root": ["child1", "child2"]}], + { + "line_limit": 2, + }, + "\x1b[1;34mMockSemanticElement\x1b[0m: root\n├── \x1b[1;34mMockSemanticElement\x1b[0m: child1\n", + ), + ], + ids = [v[0] for v in values], +) +def test_print(name, tree_structure, print_kwargs, expected_output, capsys): + # Arrange + root_nodes = build_tree(tree_structure) + tree = SemanticTree(root_nodes) + + # Act + tree.print(**print_kwargs) + actual = capsys.readouterr().out + + # Assert + assert actual==expected_output + + @pytest.mark.parametrize( "tree_structure,expected_nodes", [ @@ -44,24 +131,7 @@ def test_get_nodes( tree_structure: dict | list[dict], expected_nodes: list[str], ) -> None: - def build_tree(tree_structure, parent=None): - nodes = [] - assert isinstance(tree_structure, list) - for item in tree_structure: - if isinstance(item, dict): - for key, value in item.items(): - # Create a new TreeNode for each key - node = TreeNode(element(key), parent=parent) - nodes.append(node) - # Recursively build the tree for the children of the node - if isinstance(value, list): - children = build_tree(value, parent=node) - node.add_children(children) - else: - child_node = TreeNode(element(item), parent=parent) - nodes.append(child_node) - return nodes - + # Arrange root_nodes = 
build_tree(tree_structure) tree = SemanticTree(root_nodes) @@ -69,14 +139,4 @@ def build_tree(tree_structure, parent=None): nodes = list(tree.nodes) # Assert - assert [node.text for node in nodes] == expected_nodes - - -class MockSemanticElement(AbstractSemanticElement): - pass - - -def element(text): - t = bs4.Tag(name="p") - t.string = text - return MockSemanticElement(HtmlTag(t)) + assert [node.text for node in nodes] == expected_nodes \ No newline at end of file From 527d56c67a27915b59200599fc723b73c8302706 Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 13:10:37 +0530 Subject: [PATCH 3/9] test: add unit test case for EmptyElement. --- .../processing_steps/test_irrelevant_element_classifier.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/processing_steps/test_irrelevant_element_classifier.py b/tests/unit/processing_steps/test_irrelevant_element_classifier.py index 3eaa8f2..b31d757 100644 --- a/tests/unit/processing_steps/test_irrelevant_element_classifier.py +++ b/tests/unit/processing_steps/test_irrelevant_element_classifier.py @@ -69,6 +69,8 @@

repeating

repeating

repeating

+ +

""", [ { @@ -102,6 +104,10 @@ }, ] * 10, + { + "type": EmptyElement, + "tag": "p", + }, ], ), ], From de4ff2cc484ed62be8101f0ea410f8ee53987d38 Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 13:39:27 +0530 Subject: [PATCH 4/9] test: add unit test for sec_parser transformation history. --- .../unit/processing_engine/test_sec_parser.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/unit/processing_engine/test_sec_parser.py b/tests/unit/processing_engine/test_sec_parser.py index a92fcb3..50fcabf 100644 --- a/tests/unit/processing_engine/test_sec_parser.py +++ b/tests/unit/processing_engine/test_sec_parser.py @@ -6,6 +6,7 @@ from sec_parser.semantic_elements.composite_semantic_element import ( CompositeSemanticElement, ) +from sec_parser.processing_engine.processing_log import LogItem from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle from tests.unit._utils import assert_elements @@ -51,3 +52,34 @@ def test_smoke_test(name, html_str, unwrap_elements, expected_elements): mock_unwrap.assert_called() else: mock_unwrap.assert_not_called() + + +@pytest.mark.parametrize( + ("name", "html_str", "expected_processing_log"), + values := [ + ( + "simple", + "
Hello World.
", + ( + LogItem( + origin="TextClassifier", + payload={ + 'cls_name': 'TextElement' + }, + ), + ), + ), + ], + ids = [v[0] for v in values], +) +def test_transformation_history(name, html_str, expected_processing_log): + # Arrange + sec_parser = Edgar10QParser() + + # Act + processed_elements = sec_parser.parse(html_str) + processing_log = processed_elements[0].processing_log.get_items() + + # Assert + assert len(processed_elements)==1 # For simplicity, while crafting `html_str` make sure it always returns single element. + assert processing_log == expected_processing_log \ No newline at end of file From 919e2dd27571e9c61f61b995c85d3bfdce941570 Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 13:48:53 +0530 Subject: [PATCH 5/9] test: add unit test for IndividualSemanticElementExtractor with no get_checks. --- .../test_individual_semantic_element_extractor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/unit/processing_steps/individual_semantic_element_extractor/test_individual_semantic_element_extractor.py diff --git a/tests/unit/processing_steps/individual_semantic_element_extractor/test_individual_semantic_element_extractor.py b/tests/unit/processing_steps/individual_semantic_element_extractor/test_individual_semantic_element_extractor.py new file mode 100644 index 0000000..e27c866 --- /dev/null +++ b/tests/unit/processing_steps/individual_semantic_element_extractor/test_individual_semantic_element_extractor.py @@ -0,0 +1,13 @@ +import pytest + +from sec_parser.processing_steps.individual_semantic_element_extractor.individual_semantic_element_extractor import IndividualSemanticElementExtractor +from sec_parser.exceptions import SecParserValueError + + +def test_init_with_no_checks(): + # Arrange + get_checks = None + + # Act & Assert + with pytest.raises(SecParserValueError): + IndividualSemanticElementExtractor(get_checks=get_checks) \ No newline at end of file From b165acba809bd0a9aa862eeb725a0b55e5a15468 Mon 
Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 13:53:41 +0530 Subject: [PATCH 6/9] test: add unit test for TableCheck. --- .../single_element_checks/test_table_check.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_table_check.py diff --git a/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_table_check.py b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_table_check.py new file mode 100644 index 0000000..4685adb --- /dev/null +++ b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_table_check.py @@ -0,0 +1,18 @@ +from unittest.mock import Mock +import bs4 +import pytest + +from sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement +from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.table_check import TableCheck + +def test_table_check(): + # Arrange + element = AbstractSemanticElement(Mock()) + element.html_tag.name = "table" + check = TableCheck() + + # Act + actual = check.contains_single_element(element) + + # Assert + assert actual is True \ No newline at end of file From e4563b98b9f614a5d121bf13a81112c2a92519ef Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 14:03:09 +0530 Subject: [PATCH 7/9] test: add unit test for xbrl_tag_check with ix:nonnumeric tag. 
--- .../test_xbrl_tag_check.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_xbrl_tag_check.py b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_xbrl_tag_check.py index 1a2385b..fcaee5b 100644 --- a/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_xbrl_tag_check.py +++ b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_xbrl_tag_check.py @@ -6,7 +6,7 @@ from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.xbrl_tag_check import XbrlTagCheck -def test_contains_single_element(): +def test_contains_single_element_ix_tag(): # Arrange element = AbstractSemanticElement(Mock()) element.html_tag.name = "ix" @@ -16,4 +16,19 @@ def test_contains_single_element(): actual = check.contains_single_element(element) # Assert - assert actual is False \ No newline at end of file + assert actual is False + + +def test_contains_single_element_ix_numeric_tag(): + """Test that a tag containing ix:nonnumeric is not a single element.""" + # Arrange + element = AbstractSemanticElement(Mock()) + element.html_tag.name = "some-random-tag-name" + element.html_tag.contains_tag.side_effect = lambda tag: tag == "ix:nonnumeric" + check = XbrlTagCheck() + + # Act + actual = check.contains_single_element(element) + + # Assert + assert actual is False \ No newline at end of file From 8776ddfdab09f683d28cf0c6a9ceb39e3df7deaf Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 14:18:52 +0530 Subject: [PATCH 8/9] test: add unit test for CompositeSemanticElement's inner_elements setter.
--- .../test_inner_elements.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tests/unit/semantic_elements/composite_semantic_element/test_inner_elements.py diff --git a/tests/unit/semantic_elements/composite_semantic_element/test_inner_elements.py b/tests/unit/semantic_elements/composite_semantic_element/test_inner_elements.py new file mode 100644 index 0000000..3c20f98 --- /dev/null +++ b/tests/unit/semantic_elements/composite_semantic_element/test_inner_elements.py @@ -0,0 +1,28 @@ +import pytest + +import bs4 + +from sec_parser.processing_engine.html_tag import HtmlTag +from sec_parser.semantic_elements.composite_semantic_element import ( + CompositeSemanticElement, +) +from sec_parser.semantic_elements.semantic_elements import NotYetClassifiedElement +from sec_parser.exceptions import SecParserValueError + + +def test_inner_elements_setter(): + # Arrange + empty_elements = [] + tag = bs4.Tag(name="span") + tag.string = "A" * 60 + element = CompositeSemanticElement( + HtmlTag(tag), + inner_elements=( + NotYetClassifiedElement(HtmlTag(bs4.Tag(name="p"))), + NotYetClassifiedElement(HtmlTag(bs4.Tag(name="p"))), + ), + ) + + # Act & Assert + with pytest.raises(SecParserValueError): + element.inner_elements = None \ No newline at end of file From 7041f36b2ee0caf1fc94b9360aa71235b4cf5947 Mon Sep 17 00:00:00 2001 From: Asapanna Rakesh Date: Wed, 1 Nov 2023 14:26:32 +0530 Subject: [PATCH 9/9] test: add unit test for HighlightedTextElement's create_from_element --- .../test_highlighted_text_element.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/unit/semantic_elements/test_highlighted_text_element.py b/tests/unit/semantic_elements/test_highlighted_text_element.py index 5bbb396..1f2b6f2 100644 --- a/tests/unit/semantic_elements/test_highlighted_text_element.py +++ b/tests/unit/semantic_elements/test_highlighted_text_element.py @@ -4,6 +4,7 @@ import bs4 import pytest +from 
sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.semantic_elements.highlighted_text_element import ( HighlightedTextElement, @@ -25,6 +26,18 @@ def test_highlighted_text_element_initialization(): HighlightedTextElement(mock_html_tag, style=None) +def test_highlighted_text_element_from_element(): + # Arrange + element = AbstractSemanticElement(Mock()) + + # Act & Assert + with pytest.raises( + SecParserValueError, + match="Style must be provided.", + ): + _ = HighlightedTextElement.create_from_element(element, style=None, log_origin=None) + + def test_to_dict(): # Arrange tag = bs4.Tag(name="span")