Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PageElement: Parse page numbers and page separators #59

Merged
merged 1 commit into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions sec_parser/processing_steps/title_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TextStyle,
)
from sec_parser.semantic_elements.title_element import TitleElement
from sec_parser.semantic_elements.page_element import PageElement

if TYPE_CHECKING: # pragma: no cover
from sec_parser.semantic_elements.abstract_semantic_element import (
Expand Down Expand Up @@ -59,11 +60,18 @@ def _process_element(
"""Process each element and convert to TitleElement if necessary."""
if not isinstance(element, HighlightedTextElement):
return element

# Ensure the style is tracked
self._add_unique_style(element.style)

level = self._unique_styles_by_order.index(element.style)
level = self._unique_styles_by_order.index(element.style)

if PageElement.is_page(source = element):
return PageElement.create_from_element(
element,
log_origin=self.__class__.__name__,
)

return TitleElement.create_from_element(
element,
level=level,
Expand Down
32 changes: 32 additions & 0 deletions sec_parser/semantic_elements/page_element.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

from sec_parser.semantic_elements import IrrelevantElement
from sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement
from sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog


class PageElement(IrrelevantElement):
    """
    Irrelevant element that marks page furniture: a page number or a page footer.

    A text is treated as page furniture when it is either a bare integer
    (e.g. ``"12"``) or a ``|``-separated line whose last segment is an integer
    (e.g. ``"Company | Form 10-K | 12"``), and that integer is the next page
    number expected in sequence.
    """

    # Next page number that is_page() expects to see; 0 means that no page
    # number has been recognised yet. This is class-level state, so it is
    # shared by every caller of is_page() and is never reset by this class.
    PageNum = 0

    @staticmethod
    def is_page(source: AbstractSemanticElement) -> bool:
        """
        Return True if the text of *source* is the next expected page number.

        The first number ever seen seeds the expected sequence; after every
        match the expected number is incremented by one. Texts that do not
        end in an integer are never pages and leave the counter untouched.

        Args:
            source: element whose ``text`` attribute is inspected.

        Returns:
            True when the text carries the expected page number, else False.
        """
        text = source.text

        # In a "a | b | 12" style footer only the last segment can be the
        # page number; otherwise the whole text must be the number.
        if "|" in text:
            candidate = text.replace(" ", "").split("|")[-1]
        else:
            candidate = text.strip()

        # Convert exactly once, inside the try, so that a non-numeric last
        # segment (e.g. "Company | Form 10-K") yields False instead of an
        # uncaught ValueError further down.
        try:
            page_num = int(candidate)
        except ValueError:
            return False

        # Seed the sequence with the first number encountered.
        if PageElement.PageNum == 0:
            PageElement.PageNum = page_num

        if page_num == PageElement.PageNum:
            PageElement.PageNum += 1
            return True

        return False
Loading