From 344c404932fe03a63c5da825a4ee3cc0cfa1fab2 Mon Sep 17 00:00:00 2001 From: Li Luo <379433439@qq.com> Date: Wed, 20 Dec 2023 19:40:25 -0500 Subject: [PATCH] feat(page_element): Parse page numbers and separators (PR#59) --- .../processing_steps/title_classifier.py | 12 +++++-- sec_parser/semantic_elements/page_element.py | 32 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 sec_parser/semantic_elements/page_element.py diff --git a/sec_parser/processing_steps/title_classifier.py b/sec_parser/processing_steps/title_classifier.py index 7e3b0cc..927a2ec 100644 --- a/sec_parser/processing_steps/title_classifier.py +++ b/sec_parser/processing_steps/title_classifier.py @@ -11,6 +11,7 @@ TextStyle, ) from sec_parser.semantic_elements.title_element import TitleElement +from sec_parser.semantic_elements.page_element import PageElement if TYPE_CHECKING: # pragma: no cover from sec_parser.semantic_elements.abstract_semantic_element import ( @@ -59,11 +60,18 @@ def _process_element( """Process each element and convert to TitleElement if necessary.""" if not isinstance(element, HighlightedTextElement): return element - + # Ensure the style is tracked self._add_unique_style(element.style) - level = self._unique_styles_by_order.index(element.style) + level = self._unique_styles_by_order.index(element.style) + + if PageElement.is_page(source = element): + return PageElement.create_from_element( + element, + log_origin=self.__class__.__name__, + ) + return TitleElement.create_from_element( element, level=level, diff --git a/sec_parser/semantic_elements/page_element.py b/sec_parser/semantic_elements/page_element.py new file mode 100644 index 0000000..2b8084b --- /dev/null +++ b/sec_parser/semantic_elements/page_element.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from sec_parser.semantic_elements import IrrelevantElement +from sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement +from 
sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog
+
+
+class PageElement(IrrelevantElement):
+    """
+    Represent a page number or page separator found in a document.
+
+    Derives from IrrelevantElement. Detection is stateful: the class-level
+    counter ``PageNum`` holds the page number expected next and is shared
+    by every caller that goes through ``PageElement.is_page``.
+    """
+
+    # Next expected page number; 0 means no page number has been seen yet.
+    PageNum = 0
+
+    @staticmethod
+    def is_page(source: AbstractSemanticElement) -> bool:
+        """
+        Report whether ``source`` holds the next expected page number.
+
+        The text is either a bare integer ("12") or a "|"-separated string
+        whose last field is the integer ("Company | 12"). The first number
+        seen seeds ``PageNum``; every match advances it by one.
+
+        Args:
+            source: Element whose ``text`` attribute is inspected.
+
+        Returns:
+            True if the text parses to the expected page number (the
+            counter is then incremented), otherwise False.
+        """
+        text = source.text
+        # Only the last "|"-separated field can be the page number.
+        candidate = text.replace(" ", "").split("|")[-1] if "|" in text else text
+        try:
+            # Convert exactly once, inside the try, so a non-numeric last
+            # field ("Company | Report") yields False instead of raising.
+            page_num = int(candidate)
+        except ValueError:
+            return False
+
+        if PageElement.PageNum == 0:
+            # First number encountered: start counting from it.
+            PageElement.PageNum = page_num
+
+        if page_num != PageElement.PageNum:
+            return False
+
+        PageElement.PageNum += 1
+        return True