Skip to content

Commit

Permalink
feat(page_element): Parse page numbers and separators (PR#59)
Browse files Browse the repository at this point in the history
  • Loading branch information
luoli830 authored Dec 21, 2023
1 parent 03b1afe commit 344c404
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
12 changes: 10 additions & 2 deletions sec_parser/processing_steps/title_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TextStyle,
)
from sec_parser.semantic_elements.title_element import TitleElement
from sec_parser.semantic_elements.page_element import PageElement

if TYPE_CHECKING: # pragma: no cover
from sec_parser.semantic_elements.abstract_semantic_element import (
Expand Down Expand Up @@ -59,11 +60,18 @@ def _process_element(
"""Process each element and convert to TitleElement if necessary."""
if not isinstance(element, HighlightedTextElement):
return element

# Ensure the style is tracked
self._add_unique_style(element.style)

level = self._unique_styles_by_order.index(element.style)
level = self._unique_styles_by_order.index(element.style)

if PageElement.is_page(source = element):
return PageElement.create_from_element(
element,
log_origin=self.__class__.__name__,
)

return TitleElement.create_from_element(
element,
level=level,
Expand Down
32 changes: 32 additions & 0 deletions sec_parser/semantic_elements/page_element.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

from sec_parser.semantic_elements import IrrelevantElement
from sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement
from sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog


class PageElement(IrrelevantElement):
    """
    Semantic element for page-number text (e.g. page footers/separators).

    An element is treated as a page marker when its text is either a bare
    integer (``"12"``) or ends with ``| <integer>`` (``"Company | 12"``),
    and that integer is the next page number expected by the running
    counter ``PageNum``.
    """

    # Next page number expected by ``is_page``. ``0`` means "no page seen
    # yet": the first parsed number is then adopted as the starting page.
    # NOTE(review): this counter lives on the class, so it is shared by
    # every caller and is never reset here — presumably it needs resetting
    # between documents; confirm against the callers.
    PageNum = 0

    @staticmethod
    def is_page(source: AbstractSemanticElement) -> bool:
        """
        Return True if ``source`` holds the next expected page number.

        The candidate number is the whole stripped text or, when the text
        contains ``|``, the part after the last ``|`` with spaces removed.
        Text that does not yield an integer is not a page marker and
        returns False.

        Side effect: on a match, ``PageElement.PageNum`` is advanced by
        one; on the very first call (counter at 0) the counter is first
        initialised to the parsed number.
        """
        text = source.text
        if "|" in text:
            candidate = text.replace(" ", "").split("|")[-1]
        else:
            candidate = text.strip()

        # Parse once, inside the guard, for BOTH branches: previously the
        # "|" branch deferred int() until after the try block, so text such
        # as "Revenue | Cost" raised ValueError instead of returning False.
        try:
            page_num = int(candidate)
        except ValueError:
            return False

        # First page number encountered seeds the expected sequence.
        if PageElement.PageNum == 0:
            PageElement.PageNum = page_num

        if page_num != PageElement.PageNum:
            return False

        PageElement.PageNum += 1
        return True

0 comments on commit 344c404

Please sign in to comment.